Example #1
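# Embeds one sentence with XLNet token embeddings and returns the embedded
# Flair Sentence; `xlnet_model` is assumed to be a checkpoint name defined
# in the enclosing scope (e.g. "xlnet-base-cased").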
def embed_sentence(sentence: str,
                   pooling_operation,
                   layers: str = '1',
                   use_scalar_mix: bool = False) -> Sentence:
    embeddings = XLNetEmbeddings(pretrained_model_name_or_path=xlnet_model,
                                 layers=layers,
                                 pooling_operation=pooling_operation,
                                 use_scalar_mix=use_scalar_mix)
    flair_sentence = Sentence(sentence)
    embeddings.embed(flair_sentence)
    return flair_sentence
Example #2
    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLNetEmbeddings(
            model=xlnet_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence
Example #3
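    # Builds a stack of Flair embeddings from model keys: each key is routed
    # by substring ("bert", "roberta", "gpt2", ...) to the matching embedding
    # class, and unknown keys fall back to classic WordEmbeddings.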
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #4
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                print(
                    f"Corresponding flair embedding module not found for {model_name_or_path}"
                )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #5
def other_embeddings(embd):
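    # Pools token embeddings into document embeddings for the chosen backend
    # and embeds the train/test/val splits; final_train, final_test and
    # final_val are assumed to be DataFrames in the enclosing scope.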
    sess = tf.InteractiveSession()
    train_data_list = []
    test_data_list = []
    val_data_list = []
    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])
    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded Train data!!')
    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded Test data!!')
    print('Val embedding Started...')
    for text in final_val['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        val_data_list.append(emb)
    print('Embedded Val data!!')
    return train_data_list, test_data_list, val_data_list
Example #6
    def _get_stacked_embeddings(self) -> StackedEmbeddings:
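        # Builds the token-embedding stack declared in self.experiment: each
        # embedding name is dispatched by prefix to its Flair wrapper class.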
        layers = ",".join(str(layer) for layer in self.experiment.layers)
        pooling_operation = self.experiment.pooling_operation

        token_embeddings = []

        for embedding in self.experiment.embeddings:
            if embedding.startswith("roberta"):
                token_embeddings.append(
                    RoBERTaEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif (embedding.startswith("bert")
                  or embedding.startswith("distilbert")
                  or embedding.startswith("spanbert")):
                token_embeddings.append(
                    BertEmbeddings(
                        bert_model_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("elmo"):
                model_name = embedding.split("-")[-1]
                token_embeddings.append(ELMoEmbeddings(model=model_name))
            elif embedding.startswith("gpt2"):
                token_embeddings.append(
                    OpenAIGPT2Embeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("xlm"):
                token_embeddings.append(
                    XLMEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("xlnet"):
                token_embeddings.append(
                    XLNetEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))

        return StackedEmbeddings(embeddings=token_embeddings)
Example #7
    def embed_text(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
        model_name_or_path: str = "bert-base-cased",
    ) -> List[Sentence]:
        """ Produces embeddings for text

        * **text** - Text input; it can be a string or any of Flair's `Sentence` input formats
        * **model_name_or_path** - The hosted model name key or model path
        **return** - A list of Flair's `Sentence`s
        """
        # Convert into sentences
        if isinstance(text, str):
            sentences = Sentence(text)
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        else:
            sentences = text

        # Load correct Embeddings module
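        # (self.models is assumed to be a dict-like cache, e.g. a defaultdict,
        # so each embedding model is instantiated only on first use)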
        if not self.models[model_name_or_path]:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.models[model_name_or_path] = BertEmbeddings(
                    model_name_or_path)
            elif "roberta" in model_name_or_path:
                self.models[model_name_or_path] = RoBERTaEmbeddings(
                    model_name_or_path)
            elif "gpt2" in model_name_or_path:
                self.models[model_name_or_path] = OpenAIGPT2Embeddings(
                    model_name_or_path)
            elif "xlnet" in model_name_or_path:
                self.models[model_name_or_path] = XLNetEmbeddings(
                    model_name_or_path)
            elif "xlm" in model_name_or_path:
                self.models[model_name_or_path] = XLMEmbeddings(
                    model_name_or_path)
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.models[model_name_or_path] = FlairEmbeddings(
                    model_name_or_path)
            else:
                try:
                    self.models[model_name_or_path] = WordEmbeddings(
                        model_name_or_path)
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )
        embedding = self.models[model_name_or_path]
        return embedding.embed(sentences)
Example #8
    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = ["rnn", "pool"],
        configs: Dict = {
            "pool_configs": {
                "fine_tune_mode": "linear",
                "pooling": "mean"
            },
            "rnn_configs": {
                "hidden_size": 512,
                "rnn_layers": 1,
                "reproject_words": True,
                "reproject_words_dimension": 256,
                "bidirectional": False,
                "dropout": 0.5,
                "word_dropout": 0.0,
                "locked_dropout": 0.0,
                "rnn_type": "GRU",
                "fine_tune": True,
            },
        },
    ):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Check methods
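        # (__allowed_methods / __allowed_configs are assumed to be class-level
        # attributes listing the valid method and config names)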
        for m in methods:
            assert m in self.__class__.__allowed_methods

        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")
Example #9
def get_xlnet(model_name):
    return XLNetEmbeddings(model_name)
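
A minimal usage sketch for a helper like this (the model name and sentence are illustrative; requires flair):

    from flair.data import Sentence

    embeddings = get_xlnet("xlnet-base-cased")
    sentence = Sentence("The grass is green .")
    embeddings.embed(sentence)
    for token in sentence:
        print(token.text, token.embedding.shape)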
Example #10
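    # Apparently the tail of a flair_embed_dict-style helper (used in the
    # experiment loop below): maps each token's text to its embedding vector.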
    return {token.text: token.embedding.numpy() for token in sentence}


if __name__ == "__main__":

    from test_textsim import *
    from flair.embeddings import XLMRobertaEmbeddings, BertEmbeddings, XLNetEmbeddings, XLMEmbeddings, RoBERTaEmbeddings

    measures = {}

    SAME, DIFF = load_data("./data/test_STS2017en-en.txt")

    MODELS = {
        "xlmr": XLMRobertaEmbeddings(),
        "bert": BertEmbeddings(),
        "xlnet": XLNetEmbeddings(),
        "xlm": XLMEmbeddings(),
        "roberta": RoBERTaEmbeddings(),
    }

    for model in MODELS:

        print(model)

        results = run_experiment(SAME,
                                 DIFF,
                                 lambda x: flair_embed_dict(x, MODELS[model]),
                                 wmdistance,
                                 inverse=True)
        measures['{}-wmdist'.format(model)] = results
        print(score(results[0], results[1]))
Example #11
        'BertLS': [
            BertEmbeddings(bert_model_or_path='bert-large-uncased',
                           layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                           "15,16,17,18,19,20,21,22,23,24",
                           use_scalar_mix=True)
        ],
        "RoBERTa": [RoBERTaEmbeddings('roberta-base')],
        "RoBERTaL": [RoBERTaEmbeddings('roberta-large')],
        "RoBERTaLS": [
            RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-large",
                              layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                              "15,16,17,18,19,20,21,22,23,24",
                              use_scalar_mix=True)
        ],
        "XLNet":
        [XLNetEmbeddings(pretrained_model_name_or_path="xlnet-large-cased")],
        "XLNetS": [
            XLNetEmbeddings(pretrained_model_name_or_path="xlnet-large-cased",
                            layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                            "15,16,17,18,19,20,21,22,23,24",
                            use_scalar_mix=True)
        ],
        "DistilBert": [BertEmbeddings('distilbert-base-uncased')]
    }

    # Choose the tested embeddings
    word_embeddings = model_selector[model_type]

    # create an RNN
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=256,
Example #12
if ARGS.restore:
    ensemble_tagger = EnsembleTagger.load(model_path + "final-model.pt")
else:
    elmo_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=ELMoEmbeddings('small'),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    bert_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=BertEmbeddings(),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    xlnet_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=XLNetEmbeddings(),
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type,
                                  use_crf=True)
    flair_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=StackedEmbeddings([
                                      FlairEmbeddings('news-forward'),
                                      FlairEmbeddings('news-backward')
                                  ]),
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type,
                                  use_crf=True)
    models = []
    if ARGS.model == "be":
        models = [bert_tagger, elmo_tagger]
    elif ARGS.model == "bf":
Example #13
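    # Tail of an ANSI color-code holder class; '\033[0m' resets terminal formatting.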
    END = '\033[0m'


granularity_level = "Sent"  # one of "Word", "Sent", "Paragraph"
dynamic = False  # controls whether more important words are highlighted more strongly
graph = False  # only works with "Word" granularity_level
word_doc = True
html = True
word_window_size = 10  # effectively doubled since the window is bi-directional; only matters for word-level granularity
highlight_color_intensity = 175  # try values between 25 and 200
doc_embeddings = []
scores = []

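# Mean-pools XLNet token embeddings into a document embedding; the
# commented-out lines are alternative embedding back-ends left for reference.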
stacked_embeddings = DocumentPoolEmbeddings([
    #WordEmbeddings('en'),
    XLNetEmbeddings('xlnet-large-cased', use_scalar_mix=True)
    #WordEmbeddings('glove'),
    #WordEmbeddings('extvec'),#ELMoEmbeddings('original'),
    #BertEmbeddings('bert-base-cased'),
    #FlairEmbeddings('news-forward-fast'),
    #FlairEmbeddings('news-backward-fast'),
    #OpenAIGPTEmbeddings()
    #TransformerXLEmbeddings()
])  #, mode='max')


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--card_file_path",
                        "-c",
                        type=str,
Example #14
nlp = StanfordCoreNLP('http://localhost:9000')
properties = {'annotators': 'ssplit', 'outputFormat': 'json'}

# Getting user arguments
(mode, number, _set, load, iteration, cuda_option, save_path, log_file,
 architecture, embedding_type, loss_mode, learning_rate, score_mode,
 max_pool, args) = parse_arguments()

# Define the embeddings
if embedding_type == 1:
    selected_embedding = BertEmbeddings()
    embed_dim = 3072
elif embedding_type == 2:
    selected_embedding = FlairEmbeddings("news-forward")
    embed_dim = 2048
elif embedding_type == 3:
    selected_embedding = XLNetEmbeddings()
    embed_dim = 2048
else:
    raise ValueError(f"Unsupported embedding_type: {embedding_type}")

bert = BertEmbeddings()
flair = FlairEmbeddings("news-forward")

# Load the pretrained model of resnet
resnet = models.resnet101(pretrained=True)
# Use the model object to select the desired layer
modules = list(resnet.children())[:-1]

resnet = nn.Sequential(*modules)
resnet.eval()

# Define the transforms for the picture
scaler = transforms.Resize((224, 224))  # transforms.Scale was deprecated in favor of Resize