Example #1
def generate_topics_on_series(series):
    """https://towardsdatascience.com/covid-19-with-a-flair-2802a9f4c90f

    Returns:
        [type]: [description]
    """
    validate_text(series)

    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings("news-forward")
    flair_embedding_backward = FlairEmbeddings("news-backward")
    bert_embedding = BertEmbeddings("bert-base-uncased")

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor
    X = torch.empty(size=(len(series.index), 7168)).cuda()

    # fill tensor with embeddings
    for i, text in enumerate(tqdm(series)):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        X[i] = sentence.get_embedding()

    X = X.cpu().detach().numpy()
    torch.cuda.empty_cache()

    return X
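A minimal usage sketch for `generate_topics_on_series`, assuming the Flair objects it uses (`FlairEmbeddings`, `BertEmbeddings`, `DocumentPoolEmbeddings`, `Sentence`) plus `torch` and `tqdm` are imported at module level, and that a CUDA device is available since the buffer is allocated with `.cuda()`:

```python
import pandas as pd

# Hypothetical input: one document per row of the Series.
texts = pd.Series(["first short document", "second short document"])

X = generate_topics_on_series(texts)
print(X.shape)  # (len(texts), 7168): 3072 (BERT, last 4 layers) + 2 x 2048 (Flair forward/backward)
```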
Example #2
class DefaultFeaturizerForMultiLabelRank(ObservationFeaturizer):
    def __init__(self,
                 action_space: ActionSpace,
                 embedding_type: str = "fasttext",
                 pre_process: bool = False,
                 device: str = "cpu"):
        self.device = device
        self.pre_process = pre_process
        self.text_pre_processor = TextPreProcessor(language="english")
        self._setup_device()
        embeddings = EmbeddingRegistry.get_embedding(embedding_type)
        self.doc_embeddings = DocumentPoolEmbeddings(embeddings).to(
            torch.device(self.device))
        self.action_space = action_space
        self._current_input_embeddings = None

    def _setup_device(self):
        flair.device = torch.device(self.device)

    def init_on_reset(self, input_text: Union[List[str], str]):
        # pooled document embeddings
        text = self.text_pre_processor.process(
            input_text) if self.pre_process else input_text
        sent = Sentence(text)
        self.doc_embeddings.embed(sent)
        self._current_input_embeddings = torch.tensor(
            sent.embedding.cpu().detach().numpy())

    def featurize(self, observation: Observation) -> torch.Tensor:
        input_vector = self._current_input_embeddings
        context_vector = self._featurize_context(
            observation.get_current_action_history())
        concatenated = torch.cat((input_vector, context_vector), dim=0)
        return concatenated

    def get_observation_dim(self) -> int:
        return self._get_input_dim() + self._get_context_dim()

    def _featurize_input(self, input_index: int) -> torch.Tensor:
        # the input does not change on each step
        return self._current_input_embeddings

    def _featurize_context(self, context: List[str]) -> torch.Tensor:
        # bag of actions representation
        context_vector = torch.zeros(self.action_space.size())
        action_indices = [
            self.action_space.action_to_ix(action) for action in context
        ]
        context_vector[action_indices] = 1.0
        return context_vector

    def _get_input_dim(self):
        sent = Sentence("A random text to get the embedding dimension")
        self.doc_embeddings.embed(sent)
        dim = sent[0].embedding.shape[0]
        sent.clear_embeddings()
        return dim

    def _get_context_dim(self):
        return self.action_space.size()
Example #3
    def get_embeddings(self, sentence):

        # document_embeddings = DocumentPoolEmbeddings(
        #    [self.glove_embedding,  # initialize the document embeddings, mode = mean
        #     self.flair_embedding_backward,
        #     self.flair_embedding_forward])

        # Glove + BPE
        document_embeddings = DocumentPoolEmbeddings(
            [self.glove_embedding, self.bpe_embedding])

        # NILC fastText 600 embedding
        #document_embeddings = DocumentPoolEmbeddings(
        #            [self.fast_text_embedding])

        # Flair
        #document_embeddings = DocumentPoolEmbeddings(
        #    [self.flair_embedding_forward])

        # ElMO
        #document_embeddings = DocumentPoolEmbeddings(
        #    [self.elmo_embedding])

        # create an example sentence
        sentence = Sentence(sentence)

        # embed the sentence with our document embedding
        document_embeddings.embed(sentence)

        # now check out the embedded sentence.
        return sentence.get_embedding()
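The `get_embeddings` method above references embedding attributes created elsewhere in its class; a minimal sketch of the assumed initialisation (the attribute names match those used above, the model choices and class name are illustrative):

```python
from flair.embeddings import WordEmbeddings, BytePairEmbeddings, FlairEmbeddings

class EmbeddingProvider:  # hypothetical host class for get_embeddings()
    def __init__(self):
        # attributes used by the active GloVe + BPE combination
        self.glove_embedding = WordEmbeddings('glove')
        self.bpe_embedding = BytePairEmbeddings('en')
        # only needed if the commented-out Flair variant is re-enabled
        self.flair_embedding_forward = FlairEmbeddings('news-forward')
        self.flair_embedding_backward = FlairEmbeddings('news-backward')
```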
Example #4
class FlairEncoder(BaseTextEncoder):
    is_trained = True

    def __init__(self,
                 word_embedding: str = 'glove',
                 flair_embeddings: Tuple[str, str] = ('news-forward', 'news-backward'),
                 pooling_strategy: str = 'mean',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        self.word_embedding = word_embedding
        self.flair_embeddings = flair_embeddings
        self.pooling_strategy = pooling_strategy

    def post_init(self):
        from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings, FlairEmbeddings
        self._flair = DocumentPoolEmbeddings(
            [
                WordEmbeddings(self.word_embedding),
                FlairEmbeddings(self.flair_embeddings[0]),
                FlairEmbeddings(self.flair_embeddings[1])
            ],
            pooling=self.pooling_strategy)

    @batching
    @as_numpy_array
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        from flair.data import Sentence
        import torch
        # tokenize text
        batch_tokens = [Sentence(v) for v in text]
        self._flair.embed(batch_tokens)
        return torch.stack([v.embedding for v in batch_tokens]).detach()
Example #5
class Embedding:
    def __init__(self, filepath: Union[Path, str]):
        self._bert = BertEmbeddings(filepath)
        self.document_embeddings = DocumentPoolEmbeddings([self._bert])

    def get_vector(self, text: str) -> np.ndarray:
        sentence = Sentence(text, use_tokenizer=False)
        self.document_embeddings.embed(sentence)
        with torch.no_grad():
            vector = sentence.get_embedding()
            return vector.numpy()

    def process_reference_sentences(self, sentences: Dict[str, List[str]]):
        for name, sentence in sentences.items():
            try:
                sentence = " ".join(sentence)
                yield name, self.get_vector(sentence)
            except RuntimeError:
                logging.error("Oops! Reference sentence too long...")

    def process_batch(self, documents: Documents):
        for name, sentence in self.build_sentences(documents):
            logging.info(f"Processing {name}...")
            try:
                yield name, self.get_vector(sentence)
            except RuntimeError:
                logging.error("Oops! Sentence too long..")

    def build_sentences(self, documents: Documents):
        for name, sentences in documents.items():
            for index, sentence in sentences.items():
                yield f"{name}-{index}", " ".join(sentence)
Example #6
class EmbeddingSimilarityTransformer(CustomTransformer):
    _modules_needed_by_name = [
        'regex==2018.1.10', 'flair==0.4.1', 'segtok==1.5.7'
    ]
    _is_reproducible = False
    _can_use_gpu = True
    _repl_val = 0

    def __init__(self, embedding_name, **kwargs):
        super().__init__(**kwargs)
        self.embedding_name = embedding_name

    @staticmethod
    def get_default_properties():
        return dict(col_type="text",
                    min_cols=2,
                    max_cols=2,
                    relative_importance=1)

    @staticmethod
    def get_parameter_choices():
        return {"embedding_name": ["glove", "en", "bert"]}

    @property
    def display_name(self):
        name_map = {"glove": "Glove", "en": "FastText", "bert": "BERT"}
        return "%sEmbedding_CosineSimilarity" % name_map[self.embedding_name]

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        return self.transform(X)

    def transform(self, X: dt.Frame):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(
                    text1.get_embedding().reshape(1, -1),
                    text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                output.append(-99)
        return np.array(output)
Example #7
def test_document_pool_embeddings():
    (sentence, glove, charlm) = init_document_embeddings()
    for mode in [u'mean', u'max', u'min']:
        embeddings = DocumentPoolEmbeddings([glove, charlm], mode=mode)
        embeddings.embed(sentence)
        assert (len(sentence.get_embedding()) == 1124)
        sentence.clear_embeddings()
        assert (len(sentence.get_embedding()) == 0)
Example #8
def test_document_pool_embeddings():
    (sentence, glove, charlm) = init_document_embeddings()
    for mode in ['mean', 'max', 'min']:
        embeddings = DocumentPoolEmbeddings(
            [glove, charlm], pooling=mode, fine_tune_mode='none')
        embeddings.embed(sentence)
        assert (len(sentence.get_embedding()) == 1074)
        sentence.clear_embeddings()
        assert (len(sentence.get_embedding()) == 0)
Example #9
def create_embeddings_flair(data: pd.DataFrame,
                            column: str = "text",
                            path: str = None,
                            embeddings_type: str = "transformer",
                            typs: str = "train"):
    assert column in data.columns.tolist(), \
        "[embeddings.py] -> [create_embeddings_flair] -> Input column not in dataframe columns"
    assert embeddings_type in ["transformer", "stacked"]

    from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
    from flair.data import Sentence

    fast_text_embedding = WordEmbeddings('de')
    flair_embedding_forward = FlairEmbeddings('de-forward')
    flair_embedding_backward = FlairEmbeddings('de-backward')

    stacked_embeddings = DocumentPoolEmbeddings([
        fast_text_embedding, flair_embedding_forward, flair_embedding_backward
    ])

    transformer_embedding = TransformerDocumentEmbeddings(
        'bert-base-german-cased', fine_tune=False)

    tic = time.time()

    embeddings = []

    for i, text in enumerate(data[column].values):
        print("sentence {}/{}".format(i, len(data)))
        sentence = Sentence(text)

        if embeddings_type == "stacked":
            stacked_embeddings.embed(sentence)
        elif embeddings_type == "transformer":
            transformer_embedding.embed(sentence)

        embedding = sentence.embedding.detach().cpu().numpy()
        embeddings.append(embedding)

    embeddings = np.array(embeddings)

    columns = [
        "embedding_{}".format(feature)
        for feature in range(embeddings.shape[1])
    ]

    csv = pd.DataFrame(embeddings, columns=columns)
    csv.to_csv(path + embeddings_type + "_" + typs + ".csv", index=False)

    toc = time.time()

    print(
        "[create_embeddings_flair] -> [embeddings_type: {}, typs: {}] -> time {:.2f}s"
        .format(embeddings_type, typs, toc - tic))
Example #10
def get_sentence_embeddings(texts):
    word_embeddings = [BertEmbeddings('bert-large-uncased')]
    document_embeddings = DocumentPoolEmbeddings(word_embeddings)

    sentences = [Sentence(text) for text in texts]
    embeddings = []

    for sentence in sentences:
        document_embeddings.embed(sentence)
        sentence_embedding = sentence.get_embedding().numpy().reshape(-1)
        embeddings.append(sentence_embedding)

    return np.array(embeddings)
Example #11
def vectorize(string: str = None, selected_base_models: list = None):
    """'Vectorize' the input string using one or a combination of word embeddings;
    used when 'vector representation' is selected at Algorithms construction time.

    :param string: input string
    :param selected_base_models: list of the models we want to use in order to create the word embeddings
    :return: the pooled embedding as a numpy array
    """

    if not selected_base_models:
        raise SystemExit(f"[ERROR]: function {vectorize.__name__}() -> Provide at least one base model: ['bert',"
                         f"'roberta', 'glove', 'character', 'flair_forward', 'flair_backward']")

    embeddings = []

    if 'bert' in selected_base_models:
        embeddings.append(bert_embedding)
    if 'roberta' in selected_base_models:
        embeddings.append(roberta_embedding)
    if 'glove' in selected_base_models:
        embeddings.append(glove_embedding)
    if 'character' in selected_base_models:
        embeddings.append(character_embeddings)
    if 'flair_forward' in selected_base_models:
        embeddings.append(flair_forward)
    if 'flair_backward' in selected_base_models:
        embeddings.append(flair_backward)

    # if none of the above, then the model combination passed is not supported
    if not embeddings:
        raise SystemExit(f"[ERROR]: function {vectorize.__name__}() -> {selected_base_models} not available. "
                         f"Supported models: "
                         f"['bert', 'roberta', 'glove', 'character', 'flair_forward', 'flair_backward']")

    # We use Flair's DocumentPoolEmbeddings, which takes a list of embeddings to be combined.
    # For a string containing whitespace it creates a word embedding for each word and then pools them;
    # the 'pooling' parameter controls how they are combined - here we take their mean value.
    stacked_embeddings = DocumentPoolEmbeddings(embeddings=embeddings,
                                                fine_tune_mode='none',
                                                pooling='mean')

    sentence = Sentence(string)

    # combination by concatenating each model's output (stacking) - thanks, Flair API :)
    stacked_embeddings.embed(sentence)

    # detach() because if fine_tune_mode is set to, for example, 'linear' or 'nonlinear', apart from the
    # embedding itself it also carries the computed gradients of each layer (PyTorch's autograd module records
    # all the operations that we perform and replays them backward to compute gradients).
    # We could tweak fine_tune_mode if we wanted to fine-tune the base models further on external data sets.
    return sentence.embedding.detach().numpy()
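`vectorize()` relies on module-level embedding objects (`bert_embedding`, `roberta_embedding`, `glove_embedding`, `character_embeddings`, `flair_forward`, `flair_backward`) that are created elsewhere in the source module. A minimal sketch of how they might be initialised with Flair, assuming current class names (the original module may use older classes such as `BertEmbeddings`):

```python
from flair.embeddings import (CharacterEmbeddings, FlairEmbeddings,
                              TransformerWordEmbeddings, WordEmbeddings)

# Hypothetical module-level setup assumed by vectorize();
# model identifiers are standard Flair / Hugging Face names.
bert_embedding = TransformerWordEmbeddings('bert-base-uncased')
roberta_embedding = TransformerWordEmbeddings('roberta-base')
glove_embedding = WordEmbeddings('glove')
character_embeddings = CharacterEmbeddings()
flair_forward = FlairEmbeddings('news-forward')
flair_backward = FlairEmbeddings('news-backward')
```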
Example #12
def other_embeddings(embd):
    sess = tf.InteractiveSession()
    train_data_list = []
    test_data_list = []
    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])
    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded Train data!!')
    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded Test data!!')
    return train_data_list, test_data_list
Example #13
def is_difference_large(text1, text2):
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)

    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)

    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    print(np.mean(similarity))
Example #14
class FlairGlove100Embed(BaseEmbed):
    def __init__(self, n_dims=100, make_unit_length=True):
        from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
        super().__init__(n_dims=n_dims, make_unit_length=make_unit_length)
        embeddings = [WordEmbeddings('glove')]
        self.embeddings = DocumentPoolEmbeddings(embeddings, fine_tune_mode='none')
        self.log = getLogger(type(self).__name__)

    def get_sentence_vector(self, text):
        from flair.embeddings import Sentence
        sentence = Sentence(clean_text(text))
        _ = self.embeddings.embed(sentence)
        a = sentence.get_embedding()
        result = a.detach().cpu().numpy()
        if np.sum(result[0:5]) == 0:
            result = np.random.randn(self.n_dims)
        return result

    def fit(self, feature: Feature, **kwargs):
        super().fit(feature, **kwargs)
        self.log.debug("End Fitting FlairEmbedding")
        return

    def transform(self, feature: Feature, **kwargs) -> np.ndarray:
        self.log.debug("Start Transform TextEmbedding...")
        outputs = np.vstack([np.array([self.get_sentence_vector(i) for i in t]).mean(0) if is_1d_array(
            t) else self.get_sentence_vector(t) for t in tqdm(feature)])
        outputs = unit_length(outputs, axis=1) if self.make_unit_length else outputs
        self.log.debug("End Transform TextEmbedding")
        return self.check_output_dims(outputs, feature)
Example #15
def createFlairEmbeddings(embedding_list, data):

    embeddings = []

    sentences = data['Interest_Name'].values

    model = DocumentPoolEmbeddings(embedding_list, fine_tune_mode='nonlinear')
    for sent in sentences:
        sentence = Sentence(sent)
        model.embed(sentence)
        modeled_embedding = sentence.get_embedding()
        array = modeled_embedding.cpu().detach().numpy()
        embeddings.append(array)

    return embeddings
Example #16
class FlairGlove100AndBytePairEmbedding(ContentEmbeddingBase):
    def __init__(self, make_unit_length=True):
        super().__init__(n_dims=200, make_unit_length=make_unit_length)
        embeddings = [WordEmbeddings('glove'), BytePairEmbeddings('en')]
        self.embeddings = DocumentPoolEmbeddings(embeddings)
        self.log = getLogger(type(self).__name__)

    # noinspection PyUnresolvedReferences
    def get_sentence_vector(self, text):
        sentence = Sentence(clean_text(text))
        _ = self.embeddings.embed(sentence)
        a = sentence.get_embedding()
        result = a.cpu().detach().numpy()
        if np.sum(result[0:5]) == 0:
            result = np.random.randn(self.n_dims)
        return result

    def fit(self, feature: Feature, **kwargs):
        super().fit(feature, **kwargs)
        assert feature.feature_type == FeatureType.STR
        self.log.debug("End Fitting FlairGlove100AndBytePairEmbedding for feature name %s", feature.feature_name)
        return

    def transform(self, feature: Feature, **kwargs) -> np.ndarray:
        self.log.debug("Start Transform FlairGlove100AndBytePairEmbedding for feature name %s", feature.feature_name)
        assert feature.feature_type == FeatureType.STR
        outputs = np.vstack([self.get_sentence_vector(t) for t in tqdm(feature.values)])
        outputs = unit_length(outputs, axis=1) if self.make_unit_length else outputs
        self.log.debug("End Transform FlairGlove100AndBytePairEmbedding for feature name %s", feature.feature_name)
        return self.check_output_dims(outputs, feature)
Example #17
def embed_tweet(tweetList):
    # initialize the word embeddings
    tr_embedding = WordEmbeddings('tr')
    char_embedding = CharacterEmbeddings()

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(
        [tr_embedding, char_embedding])

    tweetTensors = []
    for tweet in tweetList:
        #print(norm_tweet(tweet))
        sentence = Sentence(norm_tweet(tweet))
        document_embeddings.embed(sentence)
        tweetTensors.append(sentence.get_embedding().data)
    return tweetTensors
Example #18
def flair_embeddings(x, *args):
    from flair.embeddings import DocumentPoolEmbeddings, DocumentRNNEmbeddings, Sentence

    word_embedders, aggregating_strategy, aggregating_params = args[0], args[1], args[2]
    embedding = None
    if aggregating_strategy == 'pooling':
        # TODO: check if kwargs work
        embedding = DocumentPoolEmbeddings(word_embedders,
                                           **aggregating_params)
    if aggregating_strategy == 'rnn':
        # TODO: check if kwargs work
        embedding = DocumentRNNEmbeddings(word_embedders, **aggregating_params)
    if embedding is None:
        raise KeyError(
            f"Unsupported aggregating strategy: {aggregating_strategy!r}")
    sentence = Sentence(x)
    embedding.embed(sentence)
    return sentence.embedding.detach().numpy().reshape(-1, 1)
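A minimal usage sketch for `flair_embeddings()`, assuming Flair's `WordEmbeddings` as the word-level embedders and mean pooling; the `aggregating_params` dict is passed straight through to `DocumentPoolEmbeddings` as keyword arguments:

```python
from flair.embeddings import WordEmbeddings

# Hypothetical call: word embedders, the aggregation strategy, and its kwargs.
word_embedders = [WordEmbeddings('glove')]
vector = flair_embeddings("The grass is green .",
                          word_embedders, 'pooling', {'pooling': 'mean'})
print(vector.shape)  # (embedding_dim, 1) column vector
```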
Example #19
def compute_embedding(embedding, remove_punctuation: bool, file_name: str):
    """
    Computes the embedding with the given model for all arguments
    :param embedding: embedding model
    :param remove_punctuation: bool indicating whether punctuation should be removed
    :param file_name: name of the file the embedded arguments are saved to
    """
    arguments = Arguments()
    document_embedding = DocumentPoolEmbeddings([embedding])

    embedded_arguments = {}

    for argument in arguments.ground_truth_arguments:
        premises = argument['premises']
        conclusion = argument['conclusion']

        conclusion_text = conclusion['conclusion_text']
        if remove_punctuation:
            conclusion_text = remove_punctuations(conclusion_text)
        conclusion_sentence = Sentence(conclusion_text)
        document_embedding.embed(conclusion_sentence)
        embedded_conclusion = conclusion_sentence.get_embedding().detach().numpy().tolist()

        embedded_premises = {}
        argument_uid = None

        for premise in premises:
            premise_text = premise[1]
            if remove_punctuation:
                premise_text = remove_punctuations(premise_text)
            premise_sentence = Sentence(premise_text)
            document_embedding.embed(premise_sentence)
            embedded_premise = premise_sentence.get_embedding().detach().numpy().tolist()
            embedded_premises[premise[2]] = embedded_premise
            argument_uid = premise[0]
        embedded_arguments[argument_uid] = [
            embedded_conclusion, embedded_premises
        ]

        save_embedding(embedded_arguments, file_name)
Example #20
def extract_bert_features(json_file, dataroot_folder, choice="yes_no", split="train"):
    questions = json.load(open(json_file))['questions']

    question_ids = [quest['question_id'] for quest in questions]
    # questions = questions[0:10]
    bert = BertEmbeddings('bert-base-uncased')
    doc_bert = DocumentPoolEmbeddings([bert])
    bert_embed_matrix = np.zeros((len(questions), 3072))
    print('Extracting bert features')

    for index, quest in tqdm(enumerate(questions)):
        sentence = Sentence(quest['question'])
        doc_bert.embed(sentence)
        bert_embed_matrix[index] = sentence.embedding.numpy()

    hdf5_file_path = os.path.join(dataroot_folder, split + '_bert_' + choice + '.hdf5')
    h5f = h5py.File(hdf5_file_path, 'w')
    h5f.create_dataset('bert_embeddings', data=bert_embed_matrix)
    h5f.create_dataset('question_ids', data=question_ids)
    h5f.close()
Example #21
def flair_embed_docPool(sentence: str) -> Vector :
    """ Embed words with Flair's WordEmbeddings and DocumentPoolEmbeddings (for multi-words)"""

    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, FlairEmbeddings
    from flair.data import Sentence
    
    # init the word embeddings
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings([flair_embedding_forward])

    # create an example sentence
    sentence = Sentence(sentence)

    # embed the sentence with our document embedding
    document_embeddings.embed(sentence)

    # now check out the embedded sentence.
    return sentence.get_embedding()
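A short usage sketch; the returned value is a Flair (PyTorch) tensor, so `.detach().cpu().numpy()` converts it if a plain array is needed. The dimensionality comment assumes the standard 2048-dimensional news-forward model:

```python
embedding = flair_embed_docPool("a multi word query")
print(embedding.shape)  # torch.Size([2048]) for news-forward
vector = embedding.detach().cpu().numpy()
```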
Example #22
def is_difference_large(text1: str, text2: str) -> bool:
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)

    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)

    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    # Example threshold
    # TODO: Determine a good threshold, for example in the pitch
    return np.mean(similarity) <= 0.06
Example #23
def cosine_embedding(sentence1, sentence2, model):
    embeddings = DocumentPoolEmbeddings(model, mode='mean')
    s1 = Sentence(sentence1)
    s2 = Sentence(sentence2)
    embeddings.embed(s1)
    embeddings.embed(s2)
    v1 = s1.get_embedding()
    v2 = s2.get_embedding()
    #print(v1, v2)   #check that you don't get empty tensors

    cos_sim = dot(v1, v2) / (norm(v1) * norm(v2))
    #print(cos_sim)
    return cos_sim  #lies in [-1, 1].
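A usage sketch for `cosine_embedding`, assuming `model` is a list of Flair token-level embeddings and a Flair version that still accepts the older `mode=` keyword used above (newer releases call it `pooling=`):

```python
from flair.embeddings import WordEmbeddings

glove = WordEmbeddings('glove')
score = cosine_embedding("the grass is green", "the sky is blue", [glove])
print(score)  # scalar in [-1, 1]
```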
Example #24
class InformedFeaturizer(ObservationFeaturizer):
    def __init__(self, device: str = "cpu"):
        self.device = device
        self._setup_device()
        self.doc_embeddings = DocumentPoolEmbeddings([WordEmbeddings("en")])

    def init_on_reset(self, question: str, facts: List[str]):
        pass

    def featurize(self, observation: Observation) -> torch.Tensor:
        sim_scores = [self._get_sim(observation.get_question(), observation.get_choice()),
                      self._get_sim(".".join(observation.get_facts()), observation.get_choice())]
        sim_scores = torch.tensor(sim_scores)
        return sim_scores

    def get_observation_dim(self):
        return 2

    def _get_sentence_embedding(self, text: str) -> torch.Tensor:
        text = "..." if len(text) == 0 else text
        sent = Sentence(text)
        self.doc_embeddings.embed(sent)
        if len(sent) > 1:
            embedding = torch.tensor(sent.embedding.cpu().numpy()).reshape(1, -1)
        else:
            embedding = torch.tensor(sent[0].embedding.cpu().numpy()).reshape(1, -1)
        return embedding

    def _setup_device(self):
        import flair, torch
        flair.device = torch.device(self.device)

    def _get_sim(self, query: str, choice_text: str):
        sim = torch.nn.CosineSimilarity(dim=1)(self._get_sentence_embedding(query),
                                               self._get_sentence_embedding(choice_text))
        return sim
Example #25
    pooling='mean',
)
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

poss_sections = {
    '#gender': ['gender', 'sex', 'gentleman'],
    '#male': ['male', 'man', 'men'],
    '#female': ['female', 'woman', 'women'],
}

# substitute term by its embedding
for list_candidates in tqdm(poss_sections.values(),
                            desc='Embedding search terms'):
    for i in range(len(list_candidates)):
        sentence = Sentence(list_candidates[i].lower())
        flair_emb.embed(sentence)
        list_candidates[i] = sentence.embedding
        sentence.clear_embeddings()

dataset = []
print('Retrieving documents from database...')
documents = Database.list_raw_documents()


def create_emb(doc):
    batch_size = 60

    dataset_section = []
    for section_title, section_text in doc['raw']['sections'].items():
        text = section_text.strip()
        # tokenize each sentence
Example #26
    def _embed_document(self, document_text: str,
                        doc_embeddings: DocumentPoolEmbeddings):
        sentence = Sentence(document_text)
        doc_embeddings.embed(sentence)
        return sentence.get_embedding().data.cpu().numpy()
Example #27
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, OneHotEmbeddings, \
 DocumentRNNEmbeddings

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')
# embeddings = OneHotEmbeddings(corpus)

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding],
    # flair_embedding_backward, flair_embedding_forward],
    # pooling='min',
    fine_tune_mode='nonlinear')
document_rnn_embeddings = DocumentRNNEmbeddings([glove_embedding])

document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding],
                                                 rnn_type='LSTM')

# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())
Example #28
class EasyDocumentEmbeddings:
    """ Document Embeddings generated by pool and rnn methods applied to the word embeddings of text

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyDocumentEmbeddings("bert-base-cased", "xlnet-base-cased", methods=["rnn"])
    ```

    **Parameters:**

    * **embeddings** - Non-keyword variable number of strings referring to model names or paths
    * **methods** - A list of strings to specify which document embeddings to use, i.e. ["rnn", "pool"] (avoids unnecessary loading of models if only using one)
    * **configs** - A dictionary of configurations for flair's rnn and pool document embeddings
    ```python
    >>> example_configs = {"pool_configs": {"fine_tune_mode": "linear", "pooling": "mean", },
    ...                   "rnn_configs": {"hidden_size": 512,
    ...                                   "rnn_layers": 1,
    ...                                   "reproject_words": True,
    ...                                   "reproject_words_dimension": 256,
    ...                                   "bidirectional": False,
    ...                                   "dropout": 0.5,
    ...                                   "word_dropout": 0.0,
    ...                                   "locked_dropout": 0.0,
    ...                                   "rnn_type": "GRU",
    ...                                   "fine_tune": True, },
    ...                  }
    ```
    """

    __allowed_methods = ["rnn", "pool"]
    __allowed_configs = ("pool_configs", "rnn_configs")

    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = ["rnn", "pool"],
        configs: Dict = {
            "pool_configs": {
                "fine_tune_mode": "linear",
                "pooling": "mean"
            },
            "rnn_configs": {
                "hidden_size": 512,
                "rnn_layers": 1,
                "reproject_words": True,
                "reproject_words_dimension": 256,
                "bidirectional": False,
                "dropout": 0.5,
                "word_dropout": 0.0,
                "locked_dropout": 0.0,
                "rnn_type": "GRU",
                "fine_tune": True,
            },
        },
    ):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Check methods
        for m in methods:
            assert m in self.__class__.__allowed_methods

        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")

    def embed_pool(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """ Stacked embeddings


        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        **return** - A list of Flair's `Sentence`s
        """
        if isinstance(text, str):
            sentences = [Sentence(text)]
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        elif isinstance(text, Sentence):
            sentences = [text]
        else:
            sentences = text
        self.pool_embeddings.embed(sentences)
        return sentences

    def embed_rnn(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """ Stacked embeddings

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        **return** - A list of Flair's `Sentence`s
        """
        if isinstance(text, str):
            sentences = [Sentence(text)]
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        elif isinstance(text, Sentence):
            sentences = [text]
        else:
            sentences = text
        self.rnn_embeddings.embed(sentences)
        return sentences
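A short usage sketch for `EasyDocumentEmbeddings`, loading only the pooled document embeddings to keep it light (the model name is illustrative):

```python
embedder = EasyDocumentEmbeddings("bert-base-cased", methods=["pool"])
sentences = embedder.embed_pool("This is a test sentence.")
vector = sentences[0].get_embedding()  # one pooled tensor per input sentence
print(vector.shape)
```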
Example #29
def main():
    print("Instantiate embeddings")
    embeddings = DocumentPoolEmbeddings([
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward"),
        ELMoEmbeddings("pubmed"),
        BertEmbeddings("bert-large-uncased"),
    ])

    print("Load pubmed_data")
    pubmed_data = pd.concat([
        pd.read_json(f"data/medline/{medline_file}") for medline_file in
        ["medline_2016.json", "medline_2017.json", "medline_2018.json"]
    ])

    print("pubmed_corpus")
    pubmed_corpus = [
        try_sentence(text) for text in
        pubmed_data.title.apply(preproc).head(10_000)
    ]
    pubmed_corpus = [
        text for text in pubmed_corpus if text
    ]
    print(pubmed_corpus[0:5])

    print("query")
    query = [
        Sentence(text) for text in
        [
            "Searching for the causal effects of body mass index in over 300 000 participants in UK Biobank, using Mendelian randomization.",
            "Prioritizing putative influential genes in cardiovascular disease susceptibility by applying tissue-specific Mendelian randomization.",
            "Longitudinal analysis strategies for modelling epigenetic trajectories",
            "FATHMM-XF: accurate prediction of pathogenic point mutations via extended features",
            "PhenoSpD: an integrated toolkit for phenotypic correlation estimation and multiple testing correction using GWAS summary statistics.",
            "LD Hub: a centralized database and web interface to perform LD score regression that maximizes the potential of summary level GWAS data for SNP heritability and genetic correlation analysis.",
            "MELODI: Mining Enriched Literature Objects to Derive Intermediates",
            "The MR-Base platform supports systematic causal inference across the human phenome",
        ]
    ]

    print("Embed")
    for text in query + pubmed_corpus:
        embeddings.embed(text)

    print("Calculate scores")
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-5)
    cos_scores = []
    for query_id, query_text in enumerate(query):
        cos_res = [
            {
                "query_id": query_id,
                "target_id": target_id,
                "score": cos(query_text.embedding,
                             target_text.embedding).item()
            }
            for target_id, target_text in enumerate(pubmed_corpus)
        ]
        cos_scores.append(cos_res)

    cos_scores = pd.concat(pd.DataFrame(x) for x in cos_scores)
    print(cos_scores)

    n = 5
    for query_id, query_text in enumerate(query):
        print(f"# Query {query_id}")
        print(query_text)

        top_n = (
            cos_scores
              .query(f"query_id == {query_id}")
              .sort_values("score", ascending=False)
              .head(n)
        )
        for target_id, target_score in zip(top_n.target_id, top_n.score):
            print(f"  ## Candidate {target_id}, score {target_score}")
            print(f"  {pubmed_corpus[target_id]}\n")
        print("\n\n")
Example #30
class EmbeddingSimilarityTransformer(CustomTransformer):
    _modules_needed_by_name = [
        "gensim==3.8.0", 'regex==2019.12.17', 'flair==0.4.1', 'segtok==1.5.7'
    ]
    _is_reproducible = False
    _can_use_gpu = True
    _repl_val = 0
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail

    def __init__(self, embedding_name, **kwargs):
        super().__init__(**kwargs)
        self.embedding_name = embedding_name

    @staticmethod
    def is_enabled():
        return False  # sometimes package flair has issues installing

    @staticmethod
    def can_use(accuracy, interpretability, **kwargs):
        """Uses all GPU memory - can lead to OOM failures in combination with other GPU-based transformers"""
        return False

    @staticmethod
    def get_default_properties():
        return dict(col_type="text",
                    min_cols=2,
                    max_cols=2,
                    relative_importance=1)

    @staticmethod
    def get_parameter_choices():
        return {"embedding_name": ["glove", "en", "bert"]}

    @property
    def display_name(self):
        name_map = {"glove": "Glove", "en": "FastText", "bert": "BERT"}
        return "%sEmbedding_CosineSimilarity" % name_map[self.embedding_name]

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        return self.transform(X)

    def transform(self, X: dt.Frame):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(
                    text1.get_embedding().reshape(1, -1),
                    text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                output.append(-99)
        return np.array(output)