def transform(self, X):
    # `stacked_embeddings` and `parse_string` are module-level helpers:
    # a flair StackedEmbeddings instance and a text-cleaning function.
    list_of_emb = []
    size_of_emb = stacked_embeddings.embedding_length
    if not isinstance(X, str):
        for doc in X:
            p_str = parse_string(doc)
            if not p_str:
                # empty document: fall back to a zero vector
                list_of_emb.append(
                    np.zeros((size_of_emb, ), dtype=np.float32))
            else:
                a_set = Sentence(p_str)
                stacked_embeddings.embed(a_set)
                list_of_emb.append(
                    a_set.get_embedding().cpu().detach().numpy())
        to_ret = np.array(list_of_emb)
    else:
        try:
            p_str = parse_string(X)
            if not p_str:
                to_ret = np.zeros((size_of_emb, ), dtype=np.float32)
            else:
                a_set = Sentence(p_str)
                stacked_embeddings.embed(a_set)
                to_ret = a_set.get_embedding().cpu().detach().numpy(
                ).reshape(1, -1)
        except Exception:
            print(type(X))
            print(X)
            # keep the return value defined even if embedding fails
            to_ret = np.zeros((size_of_emb, ), dtype=np.float32)
    return to_ret
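This transformer relies on a module-level flair StackedEmbeddings instance and a parse_string cleaning helper defined elsewhere; a minimal sketch of that setup (the embedding mix and the parse_string body are assumptions, not from the original):

from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

# assumed setup: any mix of token-level embeddings works; embedding_length
# is the length of the concatenated vector used for the zero-vector fallback
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
])

def parse_string(text):
    # hypothetical stand-in for the project's text-cleaning helper
    return text.strip() if isinstance(text, str) else ''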
Example #2
def cosine_embedding(sentence1, sentence2, model):
    embeddings = DocumentPoolEmbeddings(model, mode='mean')
    s1 = Sentence(sentence1)
    s2 = Sentence(sentence2)
    embeddings.embed(s1)
    embeddings.embed(s2)
    # sanity check: these should not be empty tensors
    v1 = s1.get_embedding().detach().cpu().numpy()
    v2 = s2.get_embedding().detach().cpu().numpy()

    cos_sim = dot(v1, v2) / (norm(v1) * norm(v2))
    return cos_sim  # lies in [-1, 1]
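A hedged usage sketch for cosine_embedding; the model argument is assumed to be a list of flair token-level embeddings (what DocumentPoolEmbeddings expects), with dot and norm taken from numpy as in the snippet:

from numpy import dot
from numpy.linalg import norm
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

model = [WordEmbeddings('glove')]
score = cosine_embedding('The cat sat on the mat.',
                         'A cat was sitting on a mat.', model)
print(score)  # value in [-1, 1]; higher means more similar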
Example #3
def is_difference_large(text1, text2):
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)

    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)

    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    print(np.mean(similarity))
Example #4
    def embedize(self, data_subset_list):
        tweet = Sentence(data_subset_list)
        embedding = TransformerDocumentEmbeddings(self.embedding)
        embedding.embed(tweet)
        tweet_emb = tweet.get_embedding()
        tweet_emb_np = tweet_emb.detach().numpy()
        return tweet_emb_np
Example #5
def embed(card_tag, card_as_sentence, card_words, card_words_org):
    # relies on module-level globals: stacked_embeddings, sent_level,
    # graph, doc_embeddings and cos (a cosine-similarity callable)
    stacked_embeddings.embed(card_tag)
    word_list = []
    card_tag_emb = card_tag.get_embedding()
    if not sent_level:
        for count, word in enumerate(card_words_org):
            n_gram_word = card_words[count]
            stacked_embeddings.embed(n_gram_word)
            n_gram_emb = n_gram_word.get_embedding()
            if graph:
                doc_embeddings.append(n_gram_emb.numpy())
            word_sim = cos(card_tag_emb.reshape(1, -1),
                           n_gram_emb.reshape(1, -1))
            word_list.append((word, word_sim))
        if graph:
            doc_embeddings.append(card_tag_emb.numpy())
        print(len(word_list))
        print(len(card_words))
        print(len(card_words_org))
    else:
        for sentence in card_as_sentence:
            set_obj = Sentence(sentence)
            stacked_embeddings.embed(set_obj)
            sentence_emb = set_obj.get_embedding()
            word_sim = cos(card_tag_emb.reshape(1, -1),
                           sentence_emb.reshape(1, -1))
            word_list.append((sentence, word_sim))
    return word_list
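The names cos, sent_level, graph and doc_embeddings above are module-level state defined elsewhere; a sketch of plausible definitions, where cos is assumed to behave like sklearn's cosine_similarity (the reshape(1, -1) calls match that signature):

from sklearn.metrics.pairwise import cosine_similarity

# assumed module-level state for the embed() helper above
cos = cosine_similarity   # cos(a, b) on (1, n) inputs returns a 1x1 similarity matrix
sent_level = False        # compare n-grams (False) or whole sentences (True)
graph = False             # collect embeddings for later plotting
doc_embeddings = []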
Example #6
def generate_topics_on_series(series):
    """https://towardsdatascience.com/covid-19-with-a-flair-2802a9f4c90f

    Returns:
        np.ndarray: one pooled document embedding per entry in `series`.
    """
    validate_text(series)

    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings("news-forward")
    flair_embedding_backward = FlairEmbeddings("news-backward")
    bert_embedding = BertEmbeddings("bert-base-uncased")

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up an empty tensor
    # (7168 = 3072 for BERT's concatenated last four layers + 2 x 2048 Flair dims)
    X = torch.empty(size=(len(series.index), 7168)).cuda()

    # fill the tensor with embeddings
    for i, text in enumerate(tqdm(series)):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        X[i] = sentence.get_embedding()

    X = X.cpu().detach().numpy()
    torch.cuda.empty_cache()

    return X
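A hedged usage sketch; it assumes a CUDA device is available (the tensor is allocated with .cuda()) and that validate_text is the project's own input checker:

import pandas as pd

texts = pd.Series([
    "Cases are rising in several regions.",
    "Vaccination campaigns are accelerating.",
])
doc_vectors = generate_topics_on_series(texts)
print(doc_vectors.shape)  # (2, 7168): 3072 BERT dims + 2 x 2048 Flair dims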
Example #7
    def predict(self, X: List[str], n_labels: int = 10):
        # returns a scipy.sparse CSR matrix of shape (len(X), number_of_tags)

        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError

        if self.verbose:
            X_iterator = tqdm(
                X, desc='Computing embeddings for prediction samples')
        else:
            X_iterator = X

        X_embeddings = []

        for doc in X_iterator:
            doc_obj = Sentence(doc)
            self.document_embedder_.embed(doc_obj)
            X_embeddings.append(doc_obj.get_embedding().detach().numpy())

        nn = NearestNeighbors(metric=self.distance_metric,
                              n_neighbors=n_labels,
                              n_jobs=self.n_jobs)
        nn.fit(self.tag_embeddings_)

        y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]),
                            dtype='int8')

        for sample_ind, text_embedding in enumerate(X_embeddings):
            nearest_neighbors = nn.kneighbors([text_embedding])[1][0]
            y_pred[sample_ind, nearest_neighbors] = 1

        return y_pred.tocsr()
Example #8
    def fit(self, X, y):

        tag_docs = self._create_tag_corpus(X, self._create_tag_docs(y))

        self.document_embedder_ = DocumentPoolEmbeddings(
            self.word_embeddings,
            pooling=self.pooling,
            fine_tune_mode=self.fine_tune_mode)

        if self.verbose:
            doc_iterator = tqdm(tag_docs, desc='Computing tag embeddings')
        else:
            doc_iterator = tag_docs

        self.tag_embeddings_ = []

        for doc in doc_iterator:
            doc_obj = Sentence(doc)
            self.document_embedder_.embed(doc_obj)
            self.tag_embeddings_.append(
                doc_obj.get_embedding().detach().numpy())

        self.tag_embeddings_ = np.array(self.tag_embeddings_)

        return self
Example #9
def phrase_to_docvec(phrase: str, doc_embedding=doc_embedding):

    # crop the phrase to the first 2000 characters; good enough for this use case
    # lowercase the text, since the uncased model variant is used
    phrase_s = Sentence(phrase[:2000].lower(), use_tokenizer=True)
    doc_embedding.embed(phrase_s)
    phrase_emb_tensor = phrase_s.get_embedding()  # get_embedding() returns a torch.FloatTensor

    return np.array(phrase_emb_tensor.data)
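The doc_embedding default above is defined elsewhere; a minimal sketch of what it might be, assuming an uncased transformer document embedding (the exact model name is an assumption):

from flair.embeddings import TransformerDocumentEmbeddings

# assumed module-level default for phrase_to_docvec; must be an *uncased*
# model to match the .lower() call above
doc_embedding = TransformerDocumentEmbeddings('bert-base-uncased')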
Example #10
def get_vector(sen, document_embeddings):
    """Get the document vector for input."""
    sentence = Sentence(sen)
    document_embeddings.embed(sentence)
    get_embed = sentence.get_embedding().detach()
    if get_embed.device.type == 'cuda':
        return get_embed.cpu().numpy()
    return get_embed.numpy()
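A brief usage sketch; the embedding stack here is only illustrative:

from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

doc_emb = DocumentPoolEmbeddings([WordEmbeddings('glove')])
vec = get_vector("A short example document.", doc_emb)
print(vec.shape)  # e.g. (100,) with flair's 100-dimensional GloVe vectors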
Example #11
    def get_similarities(self, query):
        """
        Get the similarities between the query and all docs in the corpus (WordEmbeddingModel).
        :param query: query string
        :return: list of similarities [sim_between_query_and_first_doc, sim_between_query_and_second_doc, ...]
        """
        seq = Sentence(query)
        self.vectorizer.embed(seq)
        query_vector = seq.get_embedding().tolist()
        return cosine_similarity([query_vector], self.X).flatten()
Example #12
def test_fine_tunable_flair_embedding():
    language_model_forward = LanguageModel(Dictionary.load('chars'),
                                           is_forward_lm=True,
                                           hidden_size=32,
                                           nlayers=1)
    embeddings = DocumentRNNEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)],
        hidden_size=128,
        bidirectional=False)
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 128
    assert len(sentence.get_embedding()) == embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
    embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)])
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 32
    assert len(sentence.get_embedding()) == embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
Example #13
def compute_embedding(embedding, remove_punctuation: bool, file_name: str):
    """
    Computes the embedding with the given model for all arguments.
    :param embedding: embedding model
    :param remove_punctuation: whether punctuation should be removed
    :param file_name: name of the file the embeddings are saved to
    """
    arguments = Arguments()
    document_embedding = DocumentPoolEmbeddings([embedding])

    embedded_arguments = {}

    for argument in arguments.ground_truth_arguments:
        premises = argument['premises']
        conclusion = argument['conclusion']

        conclusion_text = conclusion['conclusion_text']
        if remove_punctuation:
            conclusion_text = remove_punctuations(conclusion_text)
        conclusion_sentence = Sentence(conclusion_text)
        document_embedding.embed(conclusion_sentence)
        embedded_conclusion = (
            conclusion_sentence.get_embedding().detach().numpy().tolist())

        embedded_premises = {}
        argument_uid = None

        for premise in premises:
            premise_text = premise[1]
            if remove_punctuation:
                premise_text = remove_punctuations(premise_text)
            premise_sentence = Sentence(premise_text)
            document_embedding.embed(premise_sentence)
            embedded_premise = (
                premise_sentence.get_embedding().detach().numpy().tolist())
            embedded_premises[premise[2]] = embedded_premise
            argument_uid = premise[0]
        embedded_arguments[argument_uid] = [
            embedded_conclusion, embedded_premises
        ]

        save_embedding(embedded_arguments, file_name)
Example #14
def is_difference_large(text1: str, text2: str) -> bool:
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)

    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)

    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    # Example threshold
    # TODO: Determine a good threshold, for example in the pitch
    return np.mean(similarity) <= 0.06
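A short usage sketch of the function above; as coded, it returns True when the mean similarity is at most 0.06:

print(is_difference_large("The weather is nice today.",
                          "It is sunny and warm outside."))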
Example #15
def getBertVector(text):
    # create a sentence
    sentence = Sentence(text)

    # embed the sentence with the document-level embeddings
    document_embeddings.embed(sentence)

    return sentence.get_embedding().detach().numpy()
Example #16
def create_corpus(pdf_folder):
    corpus = []
    document_embeddings = embedding()

    for file1 in os.listdir(pdf_folder):
        if file1.endswith(".pdf"):
            pdf = pdfparser(os.path.join(pdf_folder, file1))
            sentence = Sentence(pdf)
            document_embeddings.embed(sentence)
            corpus.append(sentence.get_embedding().detach().numpy())

    # save the corpus to a pickle file
    with open('corpus.pkl', 'wb') as f:
        pickle.dump(corpus, f)
Example #17
def similaridade(frase, tipo_embeding):

    sentence = Sentence(frase, use_tokenizer=True)
    embeddings[tipo_embeding].embed(sentence)

    vetor_frase = sentence.get_embedding()

    lista_similaridade = []
    # the first element of each entry is the sentence, the second its embedding vector
    for i in dicio_vetores[tipo_embeding]:
        lista_similaridade.append((arcos(vetor_frase, i[1]), i[0]))

    lista_similaridade.sort()

    return lista_similaridade[:10]
Example #18
def createFlairEmbeddings(embedding_list, data):

    embeddings = []

    sentences = data['Interest_Name'].values

    model = DocumentPoolEmbeddings(embedding_list, fine_tune_mode='nonlinear')
    for sent in sentences:
        sentence = Sentence(sent)
        model.embed(sentence)
        modeled_embedding = sentence.get_embedding()
        embeddings.append(modeled_embedding.cpu().detach().numpy())

    return embeddings
Example #19
def get_vector(sen, document_embeddings):
    """Get the document vector for input."""
    if isinstance(sen, pd.Series):
        sentence = [Sentence(i) for i in sen]
        document_embeddings.embed(sentence)
        get_embed = [i.get_embedding().detach() for i in sentence]
        if any([i.device.type == 'cuda' for i in get_embed]):
            return [x.cpu().numpy() for x in get_embed]
        return [x.numpy() for x in get_embed]
    else:
        sentence = Sentence(sen)
        document_embeddings.embed(sentence)
        get_embed = sentence.get_embedding().detach()
        if get_embed.device.type == 'cuda':
            return get_embed.cpu().numpy()
        return get_embed.numpy()
Example #20
def preprocess_line(original_line,
                    lem=True,
                    stem=True,
                    embed=True,
                    remove_stop_words=True,
                    extra_features=False):
    tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")

    # lower sent
    line = original_line.lower()

    line = tokenizer.tokenize(line)

    if extra_features:
        features = _extract_features(original_line, line)

    if lem:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]

    if stem:
        stemmer = PorterStemmer()
        line = [stemmer.stem(word) for word in line]

    if remove_stop_words:
        stop_words = set(stopwords.words('english'))
        new_line = [word for word in line if word not in stop_words]
        # n_stop_words
        if extra_features:
            features.append(len(line) - len(new_line))
        line = new_line

    line = " ".join(line)

    if embed:
        try:
            sentence = Sentence(line)
            _EMBEDDER.embed(sentence)
            line = sentence.get_embedding().cpu().detach().numpy()
        except Exception:
            return None

    if extra_features:
        # concatenate the extra features at the end of the embedding
        line = np.concatenate((line, np.asarray(features)))

    return line
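The _EMBEDDER global is created elsewhere; a sketch of a plausible definition plus a call (the GloVe pooling choice is an assumption, and the NLTK resources used above, wordnet and stopwords, must be downloaded beforehand):

from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings

# assumed module-level embedder used by preprocess_line's embed branch
_EMBEDDER = DocumentPoolEmbeddings([WordEmbeddings('glove')])

vector = preprocess_line("Dogs were running around the park all afternoon")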
Example #21
    def decision_function(self, X: List[str], n_labels: int = 10):

        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError

        if self.verbose:
            X_iterator = tqdm(
                X, desc='Computing embeddings for prediction samples')
        else:
            X_iterator = X

        X_embeddings = []

        for doc in X_iterator:
            if doc:
                doc_obj = Sentence(doc)
            else:
                # fall back to a placeholder for empty documents
                doc_obj = Sentence('Unknown')
            self.document_embedder_.embed(doc_obj)
            try:
                X_embeddings.append(doc_obj.get_embedding().detach().numpy())
            except RuntimeError as e:
                print('Could not compute embedding for sample; '
                      'inserting a zero vector')
                # TODO: give the index of the corrupted sample
                print(e)
                X_embeddings.append(
                    np.zeros((self.tag_embeddings_.shape[1], ),
                             dtype=self.tag_embeddings_.dtype))

        nn = NearestNeighbors(metric=self.distance_metric,
                              n_neighbors=n_labels,
                              n_jobs=self.n_jobs)
        nn.fit(self.tag_embeddings_)

        y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]),
                            dtype='float')

        for sample_ind, sample_vec in enumerate(X_embeddings):
            distances, indices = nn.kneighbors([sample_vec])
            for distance, label_index in zip(distances, indices):
                y_pred[sample_ind, label_index] = distance

        return y_pred.tocsr()
Example #22
def compute_pretrained_individual_transformer_embedding(
        query, word_embedding, document_embeddings):
    """
    :param query: an arbitrary sentence (string)
    :param word_embedding: transformer word embedding whose tokenizer is used to check length
    :param document_embeddings: document-level embedding used for pooling
    :return: n-dimensional embedding as a numpy array
    """
    # repeatedly halve the query until it fits the 512-token transformer limit
    tokenized_text = word_embedding.tokenizer.tokenize(query)
    tokenized_len = len(tokenized_text)
    while tokenized_len > 512:
        query = query[:len(query) // 2]
        tokenized_text = word_embedding.tokenizer.tokenize(query)
        tokenized_len = len(tokenized_text)
    sentence = Sentence(query)
    document_embeddings.embed(sentence)
    tensor_query_embedding = sentence.get_embedding()
    numpy_query_embedding = tensor_query_embedding.data.cpu().numpy()
    return numpy_query_embedding
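A hedged usage sketch: it assumes word_embedding is a flair TransformerWordEmbeddings (which exposes the underlying Hugging Face tokenizer as .tokenizer) and document_embeddings is a matching TransformerDocumentEmbeddings; the model name is illustrative:

from flair.embeddings import TransformerWordEmbeddings, TransformerDocumentEmbeddings

word_emb = TransformerWordEmbeddings('bert-base-uncased')   # exposes .tokenizer
doc_emb = TransformerDocumentEmbeddings('bert-base-uncased')
vec = compute_pretrained_individual_transformer_embedding(
    'A very long query ...', word_emb, doc_emb)
print(vec.shape)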
Example #23
    def transform(self, X, y=None, **kwargs):
        """
        an abstract method that is used to transform according to what happend in the fit method
        :param X: features - Dataframe
        :param y: target vector - Series
        :param kwargs: free parameters - dictionary
        :return: X: the transformed data - Dataframe
        """

        X = X['text']

        dataset_hash = hash(str(X) + str(self.embedder.__dict__))
        if dataset_hash in self.dataset_cache:
            return self.dataset_cache[dataset_hash]
        else:
            embeddings = []

            for first in trange(0, len(X), self.batch_size):
                subset = X[first:first + self.batch_size]
                sentences = []
                for element in subset:
                    sentence = Sentence(element)
                    # sentence.tokens = sentence.tokens[:200]
                    sentences.append(sentence)

                self.embedder.embed(sentences)
                for sentence in sentences:
                    key = sentence.to_original_text()
                    if key in self.vector_cache:
                        vector = self.vector_cache[key]
                    else:
                        vector = sentence.get_embedding().cpu().detach().numpy()
                        self.vector_cache[key] = vector
                    embeddings.append(vector)

            embedding_dataset = numpy.vstack(embeddings)
            self.dataset_cache[dataset_hash] = embedding_dataset
            return embedding_dataset
Example #24
    def embedd_document_p(self, document: str,
                          doc_id: str) -> Tuple[np.ndarray, str]:
        flair_doc = Sentence(document)

        self.document_embedding.embed(flair_doc)
        return flair_doc.get_embedding().detach().numpy(), doc_id
Example #25
    def _embed_document(self, document_text: str,
                        doc_embeddings: DocumentPoolEmbeddings):
        sentence = Sentence(document_text)
        doc_embeddings.embed(sentence)
        return sentence.get_embedding().data.cpu().numpy()
Example #26
# + hidden=true
lyrics_embeddings = all_song_lyrics.loc[:, ['key', 'lyrics_clean']].set_index(
    'key')

all_embeddings = {}
count = 0
for i in lyrics_embeddings.index.tolist():
    count += 1
    if count % 100 == 0:
        print(count)  # progress indicator
    text = lyrics_embeddings.loc[i, 'lyrics_clean']
    try:
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        all_embeddings[i] = sentence.get_embedding().detach().numpy()
    except Exception:
        # embedding failed for this text; leave the entry empty
        all_embeddings[i] = None

flair_doc_embeddings = pd.DataFrame.from_dict(all_embeddings, orient='index')
flair_doc_embeddings.to_csv('flair_embeddings.csv')
# -

# ### Final dataset

os.chdir(data_folder)
flair_sentiment_df = pd.read_csv('./flair_sentiment.csv').set_index('key')
flair_sentiment_df.columns = ['flair_sentiment']

song_lyrics = all_song_lyrics.set_index('spotify_id').copy()
Example #27
def training_pipeline_bert(filepath=None,
                           num_words_to_print=10,
                           prefix=None,
                           min_topics=19,
                           max_topics=19,
                           step=2):

    logging.info(f'Started training_pipeline : {min_topics}-{max_topics}')
    start = datetime.datetime.now()

    if filepath is not None:
        filepath = data_dir_local / filepath
    else:
        logging.error("Please enter file name")
        exit()
    if max_topics is None:
        logging.error("Please enter a valid topic number to train model")
        exit()

    #logging.info(f'preprocessor.process_data_save: {filepath}')
    #preprocessor.process_data_save(filepath=filepath, as_text=as_text, as_pickle=as_pickle, verbose=verbose)
    #logging.info(f'phraser.raw_to_phrased_data_pipeline...')
    #phraser.raw_to_phrased_data_pipeline(to_load='text', verbose=True, overwrite_interim=True, prefix=None)
    col = cols[0]
    df = phraser.load_phrased_data_pipeline(to_load='text',
                                            verbose=True,
                                            overwrite_interim=True,
                                            prefix=None,
                                            training=True,
                                            col='resp_whytfa')

    #if prefix is None:
    #      prefix = ''
    # for topic modeling
    #trigram_docs_filepath = data_dir_processed / f'{prefix}{col}_transformed_docs_all.txt'
    #trigram_docs_filepath = f'/home/watsonrtdev/topic_modeling/input_data/topic_modeling/training/processed/{prefix}{col}_transformed_docs_all.txt'
    #trigram_docs_filepath = f'/home/watsonrtdev/topic_modeling/input_data/topic_modeling/training/processed/processed_dataframe.csv'

    #print(f'Loading input file {trigram_docs_filepath}')
    # turn to posix filepaths until gensim supports this
    #trigram_docs_filepath =  trigram_docs_filepath.as_posix()

    #trigram_docs = LineSentence(trigram_docs_filepath)
    #df = pd.read_csv(trigram_docs_filepath)
    #print(df.columns)

    #default it to min/max topics
    num_topics_range = range(min_topics, max_topics + 1, step)
    #if num_topics is not None:
    #    num_topics_range = range(num_topics, num_topics + 1, step)
    print('Num_topics_range={}'.format(num_topics_range))

    # Contextual string embeddings capture latent syntactic-semantic information
    # that goes beyond standard word embeddings. Key differences: (1) they are
    # trained without any explicit notion of words and thus model words as
    # sequences of characters, and (2) they are contextualised by the surrounding
    # text, so the same word gets different embeddings depending on its context.
    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings('news-forward')
    flair_embedding_backward = FlairEmbeddings('news-backward')

    bert_embedding = BertEmbeddings('bert-base-uncased')

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor
    X = torch.empty(size=(len(df.index), 7168))  #.cuda()
    # fill tensor with embeddings

    #  for text in tqdm(df['resp_whytfa']):    #df['text_cl']):
    #from tqdm import tqdm - show smart progress meter
    i = 0
    for text in df['resp_whytfa']:
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        embedding = sentence.get_embedding()
        X[i] = embedding
        i += 1

        if (i > 100):
            break

    print("before the PCA")

    #detach the tensor from the GPU and convert it to a NumPy array
    Y = X.cpu().detach().numpy()
    #del(X)
    #torch.cuda.empty_cache()

    #We want to cluster these vectors into topics, and we’ll invoke Agglomerative Clustering with Ward affinity from scikit-learn to do so.
    #Bottom-up hierarchical clustering algorithms have a memory complexity of O(n²), so we’ll use Principal Component Analysis to speed up this process.
    #As a side note, I did test a number of clustering algorithms (K-means, BIRCH, DBSCAN, Agglomerative with complete/average affinity), but Ward seems to perform the best in most cases

    #reduce the dimensionality of our vectors to length 768
    pca = IncrementalPCA(copy=False, n_components=768, batch_size=1000)
    #pca = PCA(n_components=768)
    X_red = pca.fit_transform(Y)

    del X
    print("After the fit_transform")

    N_CLUSTERS = 5
    # WARD CLUSTER
    ward = AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                   affinity='euclidean',
                                   linkage='ward')
    pred_ward = ward.fit_predict(X_red)
    print("After fit_predict")

    df['cluster'] = pred_ward  # later steps reference the 'cluster' column
    df.to_csv('bert_withtopic.csv')
    print("Write bert_withtopic.csv")

    #get topic composition
    topic_docs = []
    # group text into topic-documents
    for topic in range(N_CLUSTERS):
        topic_docs.append(' '.join(
            df[df['cluster'] == topic]['text_cl'].values))
    # apply function
    df_tfidf = get_top_words(topic_docs, 10)
    print(f"Top words: df_tfidf")

    #How good are our topics?
    #We find the centroids of the vectors by averaging them across each topic:
    topic_centroids = []
    for topic in tqdm(range(N_CLUSTERS)):
        X_topic = X_red[df.index[df['cluster'] == topic]]
        X_mean = np.mean(X_topic, axis=0)
        topic_centroids.append(X_mean)

    #calculate the euclidean distance of each Tweet vector to their respective topic centroid:
    topic_distances = []
    for row in tqdm(df.index):
        topic_centroid = topic_centroids[df.iloc[row]['cluster']]
        X_row = X_red[row]
        topic_distance = euclidean(topic_centroid, X_row)
        topic_distances.append(topic_distance)

    df['topic_distance'] = topic_distances
    #visualise the distribution of distances to the topic centroid
    #The closer the distribution to the left of the graph, the more compact the topic is
    df.to_csv('bert_withtopic_distance.csv')
    print('Write bert_withtopic_distance.csv')

    #topic similarity - how similar the topics are to each other
    #We will construct a euclidean distance matrix between the 10 topic centroids to find the distance between the topic averages
    df_dist_matrix = pd.DataFrame(distance_matrix(topic_centroids,
                                                  topic_centroids),
                                  index=range(N_CLUSTERS),
                                  columns=range(N_CLUSTERS))

    print(f"df_dist_matrix={df_dist_matrix}")
    with open('df_dist_matrix', 'w') as fout:
        fout.write(u'#' + '\t'.join(str(e)
                                    for e in df_dist_matrix.shape) + '\n')
        # DataFrame has no tofile(); write the raw values as tab-separated text
        np.savetxt(fout, df_dist_matrix.values, delimiter='\t')
Example #28
    def embedd_document(self, document: str) -> np.ndarray:
        flair_doc = Sentence(document)

        self.document_embedding.embed(flair_doc)
        return flair_doc.get_embedding().detach().numpy()
Example #29
        explain_pred(str(to_process))
        label = input(
            "What is the ground truth label of this? Separate labels with a space"
        )
        if label == "":
            pass
        elif label == "f":
            break
        elif label == "stop":
            csvfile.close()
            if keras and increment:
                pipe.named_steps['model'].model.save('keras_model.h5')
                pipe.named_steps['model'].model = None
                joblib.dump(pipe, 'saved_card_classification.pkl')
                print("Model Dumped!!!!")
            done = True
            sys.exit()
        else:
            the_labels = label.split()
            if increment:
                t_model = pipe.named_steps['model']
                ppset = Sentence(str(to_process))
                stacked_embeddings.embed(ppset)
                the_emb = ppset.get_embedding().cpu().detach().numpy().reshape(
                    1, -1)
                t_model.partial_fit(
                    the_emb, the_labels)  ##INCREMENTAL LEARNING MODE ENGAGED
            the_labels.append(str(to_process))
            spamwriter.writerow(the_labels)
            csvfile.flush()
Example #30
def compute_elmo_embedding(keyword):
    sentence = Sentence(keyword)
    document_embedding.embed(sentence)
    return sentence.get_embedding().detach().cpu().numpy()
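The module-level document_embedding used here is defined elsewhere; given the function name, a plausible setup might look like this (a sketch only; flair's ELMoEmbeddings requires the allennlp extra, and the 'original' model choice is an assumption):

from flair.embeddings import ELMoEmbeddings, DocumentPoolEmbeddings

# assumed module-level embedding for compute_elmo_embedding above
document_embedding = DocumentPoolEmbeddings([ELMoEmbeddings('original')])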