def get_best_thread(self, question, tag_name):
        """
        Returns id of the most similar thread for the question.

        The search is performed across the threads with a given tag, using
        cosine distance between the question vector and the thread vectors.

        Parameters
        ----------
        question : str
            The question asked
        tag_name : str
            The tag for the question

        Returns
        -------
        int
            The id of the most similar thread of the question
        """

        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        # Embed the question with the same dimensionality as the stored
        # thread vectors (the column count of the embedding matrix).
        question_vec = question_to_vec(question=question,
                                       embeddings=self.word_embeddings,
                                       dim=thread_embeddings.shape[1])

        # A single query row goes in, so the result is a length-1 array
        # holding the row index of the closest thread embedding.
        best_thread = pairwise_distances_argmin(question_vec[np.newaxis, ...],
                                                thread_embeddings,
                                                metric='cosine')

        # Index positionally with a scalar. Indexing the container with the
        # length-1 array itself raises TypeError when the ids are a plain
        # list; going through np.asarray keeps the lookup positional for
        # lists, arrays and pandas Series alike.
        return np.asarray(thread_ids)[best_thread[0]]
Exemplo n.º 2
0
    def get_best_thread(self, question, tag_name):
        """Returns id of the most similar thread for the question.

        The search is performed across the threads with a given tag: the
        question is embedded with ``question_to_vec`` and compared against
        the thread embeddings loaded for ``tag_name`` using cosine distance.

        Parameters
        ----------
        question : str
            The question asked.
        tag_name : str
            The tag whose threads are searched.

        Returns
        -------
        The element of the loaded thread ids at the position of the closest
        thread embedding.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        question_vec = question_to_vec(question, self.word_embeddings,
                                       self.embeddings_dim)

        # pairwise_distances_argmin(X, Y) returns, for every row of X, the
        # index of its nearest row in Y. The question therefore has to be X
        # and the threads Y; with the arguments the other way round every
        # entry of the result is 0 and the first thread is always returned.
        best_thread = pairwise_distances_argmin(question_vec.reshape(1, -1),
                                                thread_embeddings,
                                                metric='cosine')

        return thread_ids[best_thread[0]]
    def get_best_thread(self, question, tag_name):
        """Find the thread closest to the question among threads with the tag.

        Prints the tag name, loads the ids and embeddings stored for that
        tag, embeds the question, and returns the id found at the position
        of the thread embedding with the smallest cosine distance to it.
        """
        print(tag_name)
        ids, vectors = self.__load_embeddings_by_tag(tag_name)

        query = question_to_vec(question, self.word_embeddings,
                                self.embeddings_dim)

        # The query is wrapped in a list so it is treated as a single row;
        # the result then holds exactly one index into ``vectors``.
        nearest = pairwise_distances_argmin([query], vectors, metric='cosine')
        position = nearest[0]

        return ids[position]
def main():
    """Precompute title embeddings for every tag and pickle them per tag.

    Reads word embeddings from 'word_embeddings.tsv' and posts from
    'data/tagged_posts.tsv', then for each distinct tag writes
    ``<tag>.pkl`` into ``RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER']``.
    Each file holds a tuple ``(post_ids, vectors)`` where ``post_ids`` is a
    list and ``vectors`` is a float32 array with one row per post title.
    """
    starspace_embeddings, embeddings_dim = load_embeddings(
        'word_embeddings.tsv')
    posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\t')
    os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)

    # One grouped pass instead of re-filtering the whole frame per tag.
    for tag, tag_posts in posts_df.groupby("tag"):
        tag_post_ids = tag_posts["post_id"].tolist()

        # Size the matrix from the actual number of rows so it always
        # matches the titles enumerated below.
        tag_vectors = np.zeros((len(tag_posts), embeddings_dim),
                               dtype=np.float32)
        for i, title in enumerate(tag_posts['title']):
            tag_vectors[i, :] = question_to_vec(title, starspace_embeddings,
                                                embeddings_dim)

        # Dump post ids and vectors to a file; the context manager makes
        # sure the handle is flushed and closed.
        filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'],
                                os.path.normpath('%s.pkl' % tag))
        with open(filename, 'wb') as out_file:
            pickle.dump((tag_post_ids, tag_vectors), out_file)
Exemplo n.º 5
0
    embeddings_dim = 300


# Build one pickle per tag containing the post ids, the post titles and the
# title embeddings of every post carrying that tag.
posts_df = pd.read_csv('models\\tagged_posts.tsv', sep='\t')
gPosts = posts_df.groupby("tag").count()
# Map each tag to its count of non-null post_id values.
counts_by_tag = dict(zip(gPosts.index,gPosts.post_id))

# Make sure the output folder exists before the first dump is attempted.
os.makedirs("thread_embeddings_by_tags", exist_ok=True)

for tag, count in counts_by_tag.items():
    tag_posts = posts_df[posts_df['tag'] == tag]
    
    tag_post_ids = tag_posts.post_id.values
    titles = tag_posts.title.values
    tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)

    for i, title in enumerate(titles):
        tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)

    # Dump post ids, titles and vectors to a file; the context manager
    # makes sure the handle is flushed and closed.
    filename = os.path.join("thread_embeddings_by_tags", os.path.normpath('%s.pkl' % tag))
    with open(filename, 'wb') as out_file:
        pickle.dump((tag_post_ids, titles, tag_vectors), out_file)
'''
# Load the two text sources the vocabulary is built from.
dialogue_df = pd.read_csv('models\\dialogues.tsv', sep='\t')
posts_df = pd.read_csv('models\\tagged_posts.tsv', sep='\t')

# Dialogue lines first, followed by the post titles.
corpus = [*dialogue_df['text'].values, *posts_df['title'].values]

# Lower-cased bag of words without English stop words; min_df=2 drops
# terms that occur in fewer than two documents.
vectorizer = CountVectorizer(lowercase=True, stop_words="english", min_df=2)
vectorizer.fit(corpus)
common_words = vectorizer.get_feature_names()
Exemplo n.º 6
0
# Persist the trained intent recognizer; the context manager makes sure the
# file handle is flushed and closed.
with open(utils.RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb') as intent_file:
    pickle.dump(intent_recognizer, intent_file)

# Step 3: train a classifier that predicts the tag of a question title and
# pickle it.
print('3. Programming language classification ...')
X = stackoverflow_df['title'].values
y = stackoverflow_df['tag'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

# Load the already-fitted vectorizer from disk; it is only used to
# transform here, never re-fitted.
# NOTE(review): pickle.load can execute arbitrary code, so this file must
# come from a trusted source.
with open(utils.RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

tag_classifier = OneVsRestClassifier(LogisticRegression(penalty="l2", C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)

# Report accuracy on the held-out 20% split.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

# Persist the classifier; the context manager closes the handle.
with open(utils.RESOURCE_PATH['TAG_CLASSIFIER'], 'wb') as classifier_file:
    pickle.dump(tag_classifier, classifier_file)

# Step 4: precompute title embeddings for every tag and pickle them, one
# file per tag, into the thread-embeddings folder.
print('4. Ranking questions with embeddings ...')
_, _, embeddings, _, embeddings_dim = utils.load_embeddings()
posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\t')
# count() yields per-column non-null counts for each tag; the maximum
# across columns is used below as the number of rows of that tag.
counts_by_tag = posts_df.groupby("tag").count().max(axis=1)
os.makedirs(utils.RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)
for tag, count in counts_by_tag.items():
    tag_posts = posts_df[posts_df['tag'] == tag]
    # Reuse the rows filtered above instead of filtering the frame again.
    # NOTE(review): this is a pandas Series that keeps the row labels of
    # posts_df, so readers of the pickle must index it positionally
    # (.iloc / .values) - confirm against the consumer.
    tag_post_ids = tag_posts.post_id
    tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = utils.question_to_vec(title, embeddings, embeddings_dim)
    # Dump post ids and vectors to a file; the context manager makes sure
    # the handle is flushed and closed.
    filename = os.path.join(utils.RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))
    with open(filename, 'wb') as out_file:
        pickle.dump((tag_post_ids, tag_vectors), out_file)