Example #1
def benchmark_bert():
    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("", index=True)
    a2, followup_a2 = data_loader.questions_in_folder("assignment2",
                                                      index=True)

    # load BERT embeddings
    bert_s_s = BertSemanticSearch().from_files(bert_corpus,
                                               bert_corpus_embeddings)

    # set up dupe mapping
    dupes = load_pickle(dupe_path)
    dupes_map = create_duplicate_map(dupes)

    # evaluate
    num_correct = 0
    num_total = 0

    for i in range(len(a2)):
        idx, text = a2[i]
        pred_idx = bert_s_s.single_semantic_search(text, 4)
        pred_idx = [qs[int(pidx)][0] for pidx in pred_idx[1:]]  # skip rank 0, which is typically the query itself

        # see if one of the indices in the top n is a dupe provided that the current question has a dupe
        if dupes_map.get(idx) is not None:
            num_total += 1

            for pidx in pred_idx:
                if pidx in dupes_map[idx]:
                    num_correct += 1
                    break

    return num_correct / num_total
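
The helpers load_pickle and create_duplicate_map used above are not shown in the example. A minimal sketch of what they might look like, assuming the duplicate file stores pairs of question indices (the real on-disk format may differ):

import pickle


def load_pickle(path):
    # load any pickled object from disk
    with open(path, "rb") as f:
        return pickle.load(f)


def create_duplicate_map(dupes):
    # map each question index to the set of indices it duplicates,
    # assuming `dupes` is an iterable of (idx_a, idx_b) pairs
    dupes_map = {}
    for a, b in dupes:
        dupes_map.setdefault(a, set()).add(b)
        dupes_map.setdefault(b, set()).add(a)
    return dupes_map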
Example #2
def compare_bert_and_piazza(top_n=3):
    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("",
                                                      index=True,
                                                      timestamp=True,
                                                      qid=True)
    matches = load_json(piazza_match_path)
    id_to_idx = {q[-1]: q[0] for q in qs}

    bert_s_s = BertSemanticSearch().from_files(bert_corpus,
                                               bert_corpus_embeddings)

    num_overlap = [0, 0, 0, 0]

    for i in range(len(qs)):
        idx, text, timestamp, qid = qs[i]
        timestamp = timestamp.value // 10**9  # convert to seconds

        if matches.get(qid) is not None:

            # predictions from BERT
            bert_idx = bert_s_s.single_semantic_search(text, 100)
            bert_idx = [
                qs[int(pidx)][0] for pidx in bert_idx
                if qs[int(pidx)][2].value // 10**9 < timestamp
            ]
            bert_idx = bert_idx[:top_n]

            # predictions from piazza
            pred_idx = matches[qid]
            pred_idx = [p['id'] for p in pred_idx]  # only take ids
            pred_idx = [id_to_idx[p] for p in pred_idx
                        if p in id_to_idx]  # convert from ID to index
            pred_idx = pred_idx[:top_n]

            # count overlaps between BERT's and Piazza's predictions
            overlap = 0
            for bidx in bert_idx:
                if bidx in pred_idx:
                    overlap += 1

            num_overlap[overlap] += 1

    # plot overlaps
    plt.bar(x=list(range(4)), height=num_overlap)
    plt.xlabel("Number of overlaps")
    plt.ylabel("Number of samples")
    plt.title(
        "Overlaps Between BERT Predictions and Piazza Predictions for n={0}".
        format(top_n))
    plt.show()
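
The timestamp.value // 10**9 idiom used above (and in several later examples) relies on pandas Timestamps exposing nanoseconds since the epoch. A quick illustration:

import pandas as pd

ts = pd.Timestamp("2019-03-01 12:00:00")
unix_seconds = ts.value // 10**9  # .value is nanoseconds since the epoch
print(unix_seconds)  # 1551441600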
Example #3
def __init__(self,
             user,
             password,
             class_id,
             corpus=None,
             corpus_embeddings=None,
             default_bert=True):
    self.p = Piazza()
    self.p.user_login(user, password)
    self.class_id = class_id
    self.user_profile = self.p.get_user_profile()
    self.network = self.p.network(class_id)
    self.DB_manger = MongoDBManger()
    self.bert = BertSemanticSearch(corpus, corpus_embeddings, default_bert)
    self.parallel_cid_list = []
Example #4
def followup_duplicates(top_n=3):
    """
    2019 spring CSC148, n = 3: 0.194
    """
    posts_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2019_2020-05-03\anon.contributions.csv"
    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("",
                                                      index=True,
                                                      timestamp=True,
                                                      post_num=True)
    qidx = set([q[0] for q in qs])

    # load BERT embeddings
    bert_corpus = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2019_2020-05-03\corpus.pkl"
    bert_corpus_embeddings = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2019_2020-05-03\corpus_embeddings.pkl"
    bert_s_s = BertSemanticSearch().from_files(bert_corpus,
                                               bert_corpus_embeddings)

    num_correct = 0
    num_total = 0

    for followup in followup_qs:
        idx, text, timestamp, post_num = followup
        if post_num not in qidx:
            continue

        timestamp = timestamp.value // 10**9  # convert to seconds

        pred_idx = bert_s_s.single_semantic_search(text, 100)
        pred_num = [
            qs[int(pidx)][3] for pidx in pred_idx
            if qs[int(pidx)][2].value // 10**9 < timestamp
        ]
        pred_num = pred_num[:top_n]

        for n in pred_num:
            if n == post_num:
                num_correct += 1
                break
        num_total += 1

    return num_correct / num_total
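
Across these examples, DataLoader.questions_in_folder returns a (questions, followups) pair, and each enabled flag appears to append a field to the per-question tuple in the order index, text, timestamp, qid/post_num. This layout is inferred from how the snippets unpack the tuples, not from a documented API:

from data_loader import DataLoader

data_loader = DataLoader()
data_loader.load(posts_path)  # posts_path as defined above

qs, followup_qs = data_loader.questions_in_folder("",
                                                  index=True,
                                                  timestamp=True,
                                                  post_num=True)
idx, text, timestamp, post_num = qs[0]  # assumed field order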
Example #5
from data_loader import DataLoader
from labeler import Labeler
from bert_semantic_search import BertSemanticSearch  # module path assumed

if __name__ == "__main__":
    posts_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\anon.contributions.csv"
    path_corpus = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus.pkl"
    path_corpus_embeddings = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus_embeddings.pkl"
    label_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\Labeler.pkl"

    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("", index=True)
    as2, followup_as2 = data_loader.questions_in_folder("assignment2", index=True)

    bert_s_s = BertSemanticSearch().from_files(path_corpus, path_corpus_embeddings)

    # label dataset
    labeler = Labeler(label_path)

    for i in range(len(as2)):
        idx, text = as2[i]
        choices_idx = bert_s_s.single_semantic_search(text, 10)

        labeler.label(
            text=text,
            text_idx=idx,
            choices=[qs[int(choice_idx)][1] for choice_idx in choices_idx],
            choices_idx=[qs[int(choice_idx)][0] for choice_idx in choices_idx]
        )
        print(labeler.labels)
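
The Labeler class is imported but not shown. A minimal sketch consistent with how it is called here (a path for persistence, a label(...) method, and a labels attribute); the real class may differ:

import os
import pickle


class Labeler:
    def __init__(self, path):
        self.path = path
        self.labels = {}
        if os.path.exists(path):
            with open(path, "rb") as f:
                self.labels = pickle.load(f)

    def label(self, text, text_idx, choices, choices_idx):
        # show the question and its candidate duplicates, then record the picked one
        print(text)
        for n, choice in enumerate(choices):
            print(n, choice)
        picked = input("duplicate number (blank for none): ")
        self.labels[text_idx] = choices_idx[int(picked)] if picked else None
        with open(self.path, "wb") as f:
            pickle.dump(self.labels, f)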
Example #6
def bert_sim_score(top_n=3, time_window=None):
    """
    Concerning similarity scores:

    n = 1:
        mean:  0.68802536
        median:  0.6917961
        std:  0.076929025

    n = 2:
        mean:  0.6718048
        median:  0.67678297
        std:  0.07652894

    n = 3:
        mean:  0.6718048
        median:  0.67678297
        std:  0.07652894

    n = 4:
        mean:  0.65994626
        median:  0.6663551
        std:  0.07690837

    :return: duplicate detection accuracy
    """
    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("",
                                                      index=True,
                                                      timestamp=True)
    a2, followup_a2 = data_loader.questions_in_folder("assignment2",
                                                      index=True,
                                                      timestamp=True)

    # load BERT embeddings
    bert_s_s = BertSemanticSearch().from_files(bert_corpus,
                                               bert_corpus_embeddings)

    # set up dupe mapping
    dupes = load_pickle(dupe_path)
    dupes_map = create_duplicate_map(dupes)

    # evaluate
    num_correct = 0
    num_total = 0
    score_cutoff_no_dupe = []
    score_cutoff_dupe = []

    for i in range(len(a2)):
        idx, text, timestamp = a2[i]
        timestamp = timestamp.value // 10**9  # convert to seconds

        pred_idx, cutoff = bert_s_s.single_semantic_search_with_similarity(
            text, 100)
        pred_idxs = []
        cutoffs = []

        # no time window given: check all posts that came before
        if time_window is None:
            for j in range(len(pred_idx)):
                pidx = pred_idx[j]

                if qs[int(pidx)][2].value // 10**9 < timestamp:
                    pred_idxs.append(qs[int(pidx)][0])
                    cutoffs.append(cutoff[j])

        # time window given: check posts within specified number of days of asked question
        else:
            for j in range(len(pred_idx)):
                pidx = pred_idx[j]
                t_pred = qs[int(pidx)][2].value // 10**9

                if t_pred < timestamp < t_pred + time_window * 24 * 3600:
                    pred_idxs.append(qs[int(pidx)][0])
                    cutoffs.append(cutoff[j])

        if not cutoffs:
            continue  # no candidate survived the time filter

        cutoff = min(cutoffs[:top_n])
        pred_idx = pred_idxs[:top_n]  # keep only the top n entries

        # see if one of the indices in the top n is a dupe provided that the current question has a dupe
        found = False
        if dupes_map.get(idx) is not None:
            num_total += 1

            for pidx in pred_idx:
                if pidx in dupes_map[idx]:
                    num_correct += 1
                    found = True
                    score_cutoff_dupe.append(cutoff)
                    break

        if not found:
            score_cutoff_no_dupe.append(cutoff)
    """Score cutoff analysis"""
    score_cutoff_no_dupe = np.array(score_cutoff_no_dupe)
    score_cutoff_dupe = np.array(score_cutoff_dupe)

    # print("mean: ", np.mean(score_cutoff))
    # print("median: ", np.median(score_cutoff))
    # print("std: ", np.std(score_cutoff))

    # plot score cutoff
    plt.hist([score_cutoff_dupe, score_cutoff_no_dupe], bins=30, stacked=True)
    plt.legend(["Posts with duplicates", "Posts with no duplicates"])
    plt.xlabel("Similarity score")
    plt.ylabel("Number of samples")
    plt.title(
        "Distribution of similarity score cutoff for n={0}".format(top_n))
    plt.show()

    return num_correct / num_total
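
The commented-out statistics above can be reproduced for both cutoff arrays with a few lines (run inside the function once the arrays are built):

for name, arr in (("dupe", score_cutoff_dupe), ("no dupe", score_cutoff_no_dupe)):
    print(name, "mean:", np.mean(arr), "median:", np.median(arr), "std:", np.std(arr))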
Example #7
def filter_window_bert(top_n=3, time_window=None):
    """
    n = 3
    ---------------------------------------
    Timestamp-agnostic:     0.8161
    Before current time:    0.5690
    2 weeks before:         0.5000

    Evaluate BERT predictions, but only against posts made before the current post being evaluated.

    :param top_n: see if correct prediction is in top n predictions
    :param time_window: number of days before post to check for duplicates
    :return: duplicate detection accuracy
    """
    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("",
                                                      index=True,
                                                      timestamp=True)
    a2, followup_a2 = data_loader.questions_in_folder("assignment2",
                                                      index=True,
                                                      timestamp=True)

    # load BERT embeddings
    bert_s_s = BertSemanticSearch().from_files(bert_corpus,
                                               bert_corpus_embeddings)

    # set up dupe mapping
    dupes = load_pickle(dupe_path)
    dupes_map = create_duplicate_map(dupes)

    # evaluate
    num_correct = 0
    num_total = 0

    for i in range(len(a2)):
        idx, text, timestamp = a2[i]
        timestamp = timestamp.value // 10**9  # convert to seconds

        pred_idx = bert_s_s.single_semantic_search(text, 100)

        # no time window given: check all posts that came before
        if time_window is None:
            pred_idx = [
                qs[int(pidx)][0] for pidx in pred_idx
                if qs[int(pidx)][2].value // 10**9 < timestamp
            ]

        # time window given: check posts within specified number of days of asked question
        else:
            window_seconds = time_window * 24 * 3600
            pred_idx = [
                qs[int(pidx)][0] for pidx in pred_idx
                if (qs[int(pidx)][2].value // 10**9 < timestamp
                    < qs[int(pidx)][2].value // 10**9 + window_seconds)
            ]

        pred_idx = pred_idx[:top_n]  # keep only the top n entries

        # see if one of the indices in the top n is a dupe provided that the current question has a dupe
        if dupes_map.get(idx) is not None:
            num_total += 1

            for pidx in pred_idx:
                if pidx in dupes_map[idx]:
                    num_correct += 1
                    break

    return num_correct / num_total
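
The chained comparison in the time-window branch is easier to read as a standalone predicate; an equivalent version with the timestamps already converted to seconds:

def within_window(t_candidate, t_current, window_days):
    # True when the candidate post precedes the current post by at most window_days
    return t_candidate < t_current < t_candidate + window_days * 24 * 3600


assert within_window(0, 3600, 1)           # posted one hour apart, 1-day window
assert not within_window(0, 3 * 86400, 2)  # posted three days apart, 2-day window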
Example #8
def bert_sim_score_threshold(time_window=None, threshold=0.):
    """
    Use threshold for cosine similarity
    """
    data_loader = DataLoader()
    data_loader.load(posts_path)

    qs, followup_qs = data_loader.questions_in_folder("",
                                                      index=True,
                                                      timestamp=True)
    a2, followup_a2 = data_loader.questions_in_folder("assignment2",
                                                      index=True,
                                                      timestamp=True)

    # load BERT embeddings
    bert_s_s = BertSemanticSearch().from_files(bert_corpus,
                                               bert_corpus_embeddings)

    # set up dupe mapping
    dupes = load_pickle(dupe_path)
    dupes_map = create_duplicate_map(dupes)

    # evaluate
    num_correct = 0
    num_total = 0
    pred_entry_len_dupe = {}
    pred_entry_len_no_dupe = {}

    for i in range(len(a2)):
        idx, text, timestamp = a2[i]
        timestamp = timestamp.value // 10**9  # convert to seconds

        pred_idx = bert_s_s.single_semantic_search_using_threshold(
            text, 100, threshold=threshold)

        # no time window given: check all posts that came before
        if time_window is None:
            pred_idx = [
                qs[int(pidx)][0] for pidx in pred_idx
                if qs[int(pidx)][2].value // 10**9 < timestamp
            ]

        # time window given: check posts within specified number of days of asked question
        else:
            window_seconds = time_window * 24 * 3600
            pred_idx = [
                qs[int(pidx)][0] for pidx in pred_idx
                if (qs[int(pidx)][2].value // 10**9 < timestamp
                    < qs[int(pidx)][2].value // 10**9 + window_seconds)
            ]

        # count number of entries
        num_entries = len(pred_idx)

        # see if one of the indices in the top n is a dupe provided that the current question has a dupe
        found = False
        if dupes_map.get(idx) is not None:
            num_total += 1

            for pidx in pred_idx:
                if pidx in dupes_map[idx]:
                    num_correct += 1
                    found = True
                    pred_entry_len_dupe[num_entries] = pred_entry_len_dupe.get(
                        num_entries, 0) + 1
                    break

        if not found:
            pred_entry_len_no_dupe[num_entries] = pred_entry_len_no_dupe.get(
                num_entries, 0) + 1

    x = list(range(100))
    y_dupe = [pred_entry_len_dupe.get(i, 0) for i in range(100)]
    plt.bar(x=x, height=y_dupe)

    y_no_dupe = [pred_entry_len_no_dupe.get(i, 0) for i in range(100)]
    plt.bar(x=x, height=y_no_dupe, bottom=y_dupe)

    plt.title(
        "Distribution of number of predictions for similarity threshold {0}".
        format(threshold))
    plt.xlabel("Number of predictions")
    plt.ylabel("Number of posts")
    plt.show()

    return num_correct / num_total
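
single_semantic_search_using_threshold is not shown in the example. A plausible sketch of a thresholded search over the corpus embeddings (the name, signature, and behavior here are assumptions, not the actual method):

import torch


def semantic_search_with_threshold(query_embedding, corpus_embeddings, top_k, threshold):
    # cosine similarity of one query embedding against every corpus embedding
    sims = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(0), corpus_embeddings)
    scores, idxs = torch.topk(sims, k=min(top_k, sims.numel()))
    # keep only the hits at or above the similarity threshold
    return [int(i) for s, i in zip(scores, idxs) if float(s) >= threshold]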
Example #9
class PiazzaBot(object):
    def __init__(self,
                 user,
                 password,
                 class_id,
                 corpus=None,
                 corpus_embeddings=None,
                 default_bert=True):
        self.p = Piazza()
        self.p.user_login(user, password)
        self.class_id = class_id
        self.user_profile = self.p.get_user_profile()
        self.network = self.p.network(class_id)
        self.DB_manger = MongoDBManger()
        self.bert = BertSemanticSearch(corpus, corpus_embeddings, default_bert)
        self.parallel_cid_list = []

    def heart_beat(self):
        """
        triggers the heart beat code which process all new posts and puts the data for them into the db and also
        make new postings and suggestions for posts in our

        :return: NA
        """
        posts = self.network.iter_all_posts()
        for post in posts:
            try:
                cid = post["id"]
                query = {"cid": cid}
                result = self.DB_manger.find(query)
                db_dict = self.create_db_dict(post, result)

                # TODO: remove HTML tags

                if result is None and db_dict is not None:
                    self.DB_manger.insert(db_dict)
                    if not db_dict["is_marked"]:
                        self.create_piazza_bot_follow_up(
                            cid, "Piazza Bot is trying to process this post")
                        self.make_private(db_dict)

                elif db_dict is not None:
                    if not db_dict["is_processed"] and db_dict[
                            "is_marked"] and len(self.parallel_cid_list) != 0:
                        self.make_piazza_suggestions(db_dict, cid)
                    elif not db_dict["is_marked"]:
                        print("here")
                        self.create_piazza_bot_follow_up(
                            cid, "Piazza Bot is trying to process this post")
                        self.make_private(db_dict)

                    # update the value in the db if the change_log or history has changed
                    if (db_dict["change_log_len"] > result["change_log_len"]
                            or db_dict["revision"] > result["revision"]):
                        self.DB_manger.insert_update(query, db_dict)

            except KeyError:
                print("no cid")

    def generate_embeddings(self):
        """
        generate the embeddings for all the current posts in the data base

        :return: NA
        """
        docs = self.DB_manger.get_all()
        if docs is None:
            return 1

        corpus = []
        corpus_embeddings = []
        parallel_cid_list_local = []
        for doc in docs:
            corpus.append(doc["content"])
            corpus_embeddings.append(pickle.loads(doc["encoding"]))
            parallel_cid_list_local.append(doc["cid"])

        # stack the list of loaded 1-D tensors into a single 2-D tensor
        corpus_embeddings = torch.stack(corpus_embeddings, dim=0)

        self.bert.set_corpus(corpus)
        self.bert.set_corpus_embeddings(corpus_embeddings)
        self.parallel_cid_list = parallel_cid_list_local

    def create_db_dict(self, post, old_post, tensor=True):
        """
        generate the embeddings for all the current posts in the data base

        :param post: the new post json data we want to process into a dict we can put into the db
        :param old_post: old db value for the current post
        :return: post dict formatted for the DB
        """
        try:
            cid = post["id"]
            history = post["history"]
            change_log_len = len(post["change_log"])
            revision = len(history)
            cur_post_content = history[-1]
            uid = self.find_uid(cur_post_content)
            if "gd6v7134AUa" == uid:
                return None

            post_type = post["type"]
            post_folders = post['folders']
            post_subject = cur_post_content['subject']
            post_content = cur_post_content['content']
            is_marked_by_pb, is_processed, mark_id = self.is_marked_by_piazza_bot(
                post["children"], old_post)

            new_value = {
                "cid": cid,
                "revision": revision,
                "change_log_len": change_log_len,
                "uid": uid,
                "type": post_type,
                "folders": post_folders,
                "subject": post_subject,
                "content": post_content,
                "is_marked": is_marked_by_pb,
                "mark_id": mark_id,
                "is_processed": is_processed
            }
            # generate a new embedding if this is first time this post is being added to the db or if there was a content update
            if tensor and (old_post is None
                           or revision > old_post["revision"]):
                encoding = pickle.dumps(self.bert.encode_content(post_content))
                print(encoding)
                new_value["encoding"] = encoding
            return new_value

        except KeyError as e:
            print(e)
            return None

    def is_marked_by_piazza_bot(self, children, old_post):
        """
        figure out of the current post has been marked by the bot and processed. if the current post has been marked
        then get the cid for the marking follow up
        :param children: current children posts(follow ups) for the current post
        :param old_post: old db value for the current post
        :return: boolean, boolean, cid
        """
        len_children = len(children)
        if len_children == 0:
            print("getting childern len 0")
            return False, False, "None"

        for follow_up in children:
            if follow_up['type'] == "i_answer":
                return True, True, "None"

            subject = follow_up['subject']
            if subject == "Piazza Bot is trying to process this post":
                return True, False, follow_up['id']
            elif subject == "Piazza Bot Has Processed this post":
                return True, True, follow_up['id']
            elif len(subject) > 24 and subject.startswith('<p><b>Piazza Bot</b></p>'):
                return True, False, follow_up['id']

        if old_post is not None and old_post["is_marked"]:
            return True, True, old_post["mark_id"]

        return False, False, "None"

    def make_private(self, db_dict):
        """
        make the post associate with the current db dict object private
        :param db_dict: db dict object of the post we want to make private
        :return: 1 if successful else 0
        """
        try:
            if "gd6v7134AUa" != db_dict["uid"]:
                self.update_post(db_dict["cid"], db_dict["type"],
                                 db_dict["revision"], db_dict["folders"],
                                 db_dict["subject"], db_dict["content"], False)

            return 1
        except KeyError:
            return 0

    def make_suggestion_string(self, cur_cid, post_cid):
        """Build the html links suggesting a potential duplicate and the mark-as-duplicate / mark-as-follow-up actions."""
        link = '<p><a href="https://piazza.com/class/kg9odngyfny6s9?cid={}" target="_blank" rel="noopener">Potential Duplicate of @{}</a></p>'.format(
            cur_cid, cur_cid)
        mark_dup = '<p><a href="http://127.0.0.1:5000/api/dup/{}/{}" target="_blank" rel="noopener">Mark Current Post as Duplicate of @{}</a>'.format(
            post_cid, cur_cid, cur_cid)
        mark_followup = 'or <a href="http://127.0.0.1:5000/api/followup/{}/{}" target="_blank" rel="noopener">Mark Current Post as Follow up of @{}</a></p>'.format(
            post_cid, cur_cid, cur_cid)
        return link + mark_dup + mark_followup

    def make_piazza_suggestions(self, db_dict, cid):
        #TODO add getting suggestions code
        msg = '<p><b>Piazza Bot</b></p><p><a href="http://127.0.0.1:5000/api/post/{}" target="_blank" rel="noopener">Make Post Public</a></p>'.format(
            cid)

        try:
            if "gd6v7134AUa" != db_dict["uid"]:
                topk_idxs = self.bert.single_semantic_search(
                    db_dict["content"], top_k=3)
                topk_cids = [self.parallel_cid_list[idx] for idx in topk_idxs]

                for dup_cid in topk_cids:
                    if dup_cid != cid:
                        msg += self.make_suggestion_string(dup_cid, cid)

                self.update_follow_up(db_dict["mark_id"], msg)

            return 1
        except KeyError:
            return 0

    def find_uid(self, cur_post_content):
        """
        find the uid from the most latest post history(content)

        :param cur_post_content: the content params fot he post we are working on
        :return: the uid for the user who made the last edit on this post
        """
        try:
            uid = cur_post_content["uid"]
        except KeyError:
            uid = ""
        return uid

    def create_post(self,
                    post_folders,
                    post_subject,
                    post_content,
                    post_type="question",
                    is_announcement=0,
                    bypass_email=0,
                    anonymous=False):
        """
        For simulating asking a question on Piazza. See the Piazza package for full documentation.

        NOTE: post_folders is actually a list of string, not a single string
        """
        info = self.network.create_post(post_type=post_type,
                                        post_folders=post_folders,
                                        post_subject=post_subject,
                                        post_content=post_content,
                                        is_announcement=is_announcement,
                                        bypass_email=bypass_email,
                                        anonymous=anonymous)
        return info

    def update_post(self,
                    cid,
                    post_type,
                    revision,
                    post_folders,
                    post_subject,
                    post_content,
                    visibility_all=True):
        """Update a post

        :param cid: cid of the post we want to update
        :param post_type: the type we want to change the post to "note", "question" or "poll"
        :param revision:
        :param post_folders:
        :param post_subject:
        :param post_content:
        :param visibility_all: change post visibility from all to just the instructors and original poster
        :return: if the post update was successful
        """

        params = {
            "cid": cid,
            "subject": post_subject,
            "content": post_content,
            "folders": post_folders,
            "type": post_type,
            "revision": revision,
            "visibility": "all" if visibility_all else "private"
        }
        print(params)
        return self.network._rpc.content_update(params)

    def create_piazza_bot_follow_up(self, cid, content, ionly=False):
        """Create a follow-up on a post.

        :param cid: cid of the post we want to add this follow up too
        :param content: content of the follow up post
        :param ionly: make the visibility of the follow only instructors
        :return: follow up was created
        """

        params = {
            "cid": cid,
            "type": "followup",
            "subject": content,
            "content": "",
        }
        if ionly:
            params["config"] = {"ionly": True}
        return self.network._rpc.content_create(params)

    def update_follow_up(self, followup_post, content):
        """update a follow-up on a post

        :param followup_post: json of the follow up post
        :param content: content of the follow up post
        :return: if the follow up post was successful updated
        """
        self.network.update_post(followup_post, content)

    def get_post(self, cid):
        """ retrieve data for a certain post

        :param cid: cid of the post of you want to retrieve data for
        :return: if the post update was successful
        """
        return self.network.get_post(cid)

    def get_post_from_db(self, cid):
        """ retrieve data from the db for a certain post

        :param cid: cid of the post of you want to retrieve data for
        :return: Mongo result object
        """
        query = {"cid": cid}
        return self.DB_manger.find(query)

    def mark_as_duplicate(self,
                          duplicated_cid,
                          master_cid,
                          msg='Piazza bot found this Duplicate'):
        """ make the given post as duplicate of another

        :param duplicated_cid: cid of the post of you want to make as duplicate
        :param master_cid: cid of the post of you want to put the duplicate under
        :param msg: msg for why the post is marked as a duplicate
        :return: if the duplicate mark request was successful
        """
        self.network.mark_as_duplicate(duplicated_cid, master_cid, msg)

    def delete_post(self, cid):
        """ delete a post from piazza

        :param cid: cid of the post of you want to delete
        :return: if the delete request was successful
        """
        self.network.delete_post(cid)

    def delete_post_db(self, cid):
        """ delete a post from the db

        :param cid: cid of the post of you want to delete
        :return: Mongo result object
        """
        return self.DB_manger.del_by_cid(cid)

    def get_piazza_suggestions(self, query):
        params = {"nid": self.class_id, "query": query}
        r = self.network._rpc.request(method="network.find_similar",
                                      data=params)
        return self.network._rpc._handle_error(
            r, "Could not get suggestions {}.".format(repr(params)))

    def get_full_piazza(self):
        posts = self.network.iter_all_posts()
        dataframe_cols = ["cid", "content", "match 1 cid", "match 2 cid", "match 3 cid"]
        dataframe = pd.DataFrame(columns=dataframe_cols)
        for post in posts:
            db_dict = self.create_db_dict(post, None, False)
            if db_dict is None:
                continue
            content = db_dict["content"]
            result = self.get_piazza_suggestions(content)
            cid = db_dict["cid"]
            try:
                suggestions = result["list"]
            except KeyError:
                continue

            counter = 0
            new_row = {
                "cid": cid,
                "content": content,
                "match 1 cid": "None",
                "match 2 cid": "None",
                "match 3 cid": "None"
            }
            for suggestion in suggestions:
                if suggestion['id'] != cid:
                    if counter == 0:
                        new_row["match 1 cid"] = suggestion['id']
                    elif counter == 1:
                        new_row["match 2 cid"] = suggestion['id']
                    elif counter == 2:
                        new_row["match 3 cid"] = suggestion['id']
                        break
                    counter += 1

            # DataFrame.append was removed in pandas 2.0, so concat a one-row frame instead
            dataframe = pd.concat([dataframe, pd.DataFrame([new_row])], ignore_index=True)

        dataframe.to_csv(
            r"C:\Users\sohai\Documents\Uni 2020\csc392\piazzabot\data\paizza_api_matchs.csv"
        )
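
A possible way to run the bot end to end, with placeholder credentials (the class id below is taken from the hardcoded link template in make_suggestion_string; everything here is illustrative):

import time

if __name__ == "__main__":
    bot = PiazzaBot(user="bot@example.com",  # placeholder credentials
                    password="********",
                    class_id="kg9odngyfny6s9")
    while True:
        bot.generate_embeddings()  # refresh the BERT corpus from the db first
        bot.heart_beat()           # then process new posts and make suggestions
        time.sleep(60)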