Example #1
    def initialize_embeddings(self, fastbert=True, stackedembeddings=True):

        # Consider using pooling_operation="first", use_scalar_mix=True for the parameters

        # initialize individual embeddings
        if fastbert:
            bert_embedding = BertEmbeddings('distilbert-base-uncased',
                                            layers='-1')

        else:
            bert_embedding = BertEmbeddings('bert-base-cased', layers='-1')

        if stackedembeddings:
            glove_embedding = WordEmbeddings('glove')

            # init Flair forward and backwards embeddings
            flair_embedding_forward = FlairEmbeddings('news-forward')
            flair_embedding_backward = FlairEmbeddings('news-backward')

            embedding_types = [
                bert_embedding, glove_embedding, flair_embedding_forward,
                flair_embedding_backward
            ]

            embeddings = StackedEmbeddings(embeddings=embedding_types)

        else:

            embeddings = bert_embedding

        return embeddings
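A minimal usage sketch for the stack returned above, assuming only flair's Sentence API; `embedder` is a hypothetical instance of whatever class defines this method:

from flair.data import Sentence

embeddings = embedder.initialize_embeddings(fastbert=True, stackedembeddings=True)
sentence = Sentence("Flair stacks BERT, GloVe and character-LM vectors per token.")
embeddings.embed(sentence)
for token in sentence:
    print(token.text, token.embedding.shape)  # one concatenated vector per token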
Example #2
def out_embedding(type_, model, n_layers, stacked=False):
    '''
    Create an embedding object of the given type for later use.
    :param type_: (str) type of embedding (currently only BERT or Flair embeddings)
    :param model: (str) pretrained model name or path for the BERT embedding
    :param n_layers: (int) number of last BERT layers to use
    :param stacked: (bool) whether to stack this embedding with Flair embeddings
    :return: embedding: (BertEmbeddings / StackedEmbeddings) embedding object
    '''
    out_layers = ','.join([str(-i) for i in range(1, n_layers + 1)])
    if not stacked:
        if type_.lower() == 'bert':
            embedding = BertEmbeddings(bert_model_or_path=model,
                                       layers=out_layers)
            return embedding
        else:
            emb = WordEmbeddings('glove')
    else:
        emb = BertEmbeddings(bert_model_or_path=model, layers=out_layers)

    flair_forward = FlairEmbeddings('news-forward-fast')
    flair_backward = FlairEmbeddings('news-backward-fast')
    embedding = StackedEmbeddings(
        embeddings=[flair_forward, flair_backward, emb])

    return embedding
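For illustration (hypothetical argument values): with n_layers=3 the helper builds the layer string '-1,-2,-3', so a call like the one below stacks the last three BERT layers with the fast Flair language models:

emb = out_embedding(type_='bert', model='bert-base-cased', n_layers=3, stacked=True)
# internally out_layers == '-1,-2,-3'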
Example #3
    def _collect_embeddings(self, embeddings):
        embedders = {
            "flair-forward": FlairEmbeddings("multi-forward"),
            "flair-backward": FlairEmbeddings("multi-backward"),
            "charm-forward": CharLMEmbeddings("news-forward"),
            "charm-backward": CharLMEmbeddings("news-backward"),
            "glove": WordEmbeddings("glove"),
            "bert-small": BertEmbeddings("bert-base-uncased"),
            "bert-large": BertEmbeddings("bert-large-uncased"),
            "elmo-small": ELMoEmbeddings("small"),
            "elmo-large": ELMoEmbeddings("original"),
        }

        return [embedders[embedding] for embedding in embeddings]
Example #4
def generate_topics_on_series(series):
    """https://towardsdatascience.com/covid-19-with-a-flair-2802a9f4c90f

    Returns:
        [type]: [description]
    """
    validate_text(series)

    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings("news-forward")
    flair_embedding_backward = FlairEmbeddings("news-backward")
    bert_embedding = BertEmbeddings("bert-base-uncased")

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor
    X = torch.empty(size=(len(series.index), 7168)).cuda()

    # fill tensor with embeddings
    i = 0
    for text in tqdm(series):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        embedding = sentence.get_embedding()
        X[i] = embedding
        i += 1

    X = X.cpu().detach().numpy()
    torch.cuda.empty_cache()

    return X
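The linked article goes on to cluster these document vectors into topics; a minimal sketch of that downstream step, assuming scikit-learn's KMeans and a hypothetical cluster count (the 7168 columns are presumably the four pooled BERT layers, 3072, plus two 2048-dimensional Flair LMs):

from sklearn.cluster import KMeans
import pandas as pd

texts = pd.Series(["first document text", "second document text", "third document text"])
X = generate_topics_on_series(texts)          # shape: (len(texts), 7168), needs a GPU
topics = KMeans(n_clusters=2).fit_predict(X)  # one topic id per document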
Example #5
    def __init__(self, model: Optional[BertEmbeddings] = None):
        super(BertPretrained, self).__init__()

        if model is not None:
            self.model = model
        else:
            self.model = BertEmbeddings('bert-base-uncased')
Example #6
def main():
    args = _parse_args()
    tsv_path = args.tsv_path

    embedding = BertEmbeddings('bert-base-cased')

    sentences = [[]]
    with open(tsv_path, 'r') as f:
        for i, l in enumerate(f.readlines()):
            if l.strip():
                token, *_ = l.strip().split('\t')
                sentences[-1].append(token.lower())
            else:
                sentences.append([])

    f_sentences = [Sentence(' '.join(s)) for s in sentences]

    for s in progressbar.progressbar(f_sentences):
        embedding.embed(s)

        for t in s:
            print('\t'.join(t.embedding.numpy().astype(str)))
        print()

        s.clear_embeddings()
Example #7
def download_flair_models():
    w = WordEmbeddings("en-crawl")
    w = WordEmbeddings("news")
    w = FlairEmbeddings("news-forward-fast")
    w = FlairEmbeddings("news-backward-fast")
    w = FlairEmbeddings("mix-forward")
    w = BertEmbeddings("bert-base-uncased")
Example #8
 def __init__(self, pipeline):
     self.mode = pipeline.mode
     self.type = pipeline.embedding_type
     embedders = []
     for component in pipeline.embedders:
         if "forward" in component or "backward" in component:
             embedders.append(FlairEmbeddings(component))
         elif "glove" in component:
             embedders.append(WordEmbeddings(component))
         elif "bert" in component:
             embedders.append(BertEmbeddings(component))
         elif len(component) == 2:
             # see https://github.com/zalandoresearch/flair/blob/master/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md#fasttext-embeddings
             embedders.append(WordEmbeddings(component))
             embedders.append(BytePairEmbeddings(component))
         else:
             raise ValueError(f"unknown embedder: {component}")
     if self.type == "document":
         self.embedder = self._make_doc_embedder(pipeline, embedders)
     elif self.type == "word":
         self.embedder = StackedEmbeddings(embedders)
     elif self.type == "both":
         self.embedders = [
             self._make_doc_embedder(pipeline, embedders),
             StackedEmbeddings(embedders),
         ]
     else:
         raise ValueError(
             f"Innapropriate embedding type {pipeline.embedding_type}, "
             "should be 'word', 'document', or 'both'.")
Example #9
def create_embeddings(params):
    embedding_type = params["embedding_type"]
    assert embedding_type in ["bert", "flair", "char"]
    if embedding_type == "bert":
        bert_embedding = BertEmbeddings(params["bert_model_dirpath_or_name"],
                                        pooling_operation="mean")

        embedding_types: List[TokenEmbeddings] = [bert_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "flair":
        glove_embedding = WordEmbeddings(
            '/opt/kanarya/glove/GLOVE/GloVe/vectors.gensim')
        word2vec_embedding = WordEmbeddings(
            '/opt/kanarya/huawei_w2v/vector.gensim')
        fast_text_embedding = WordEmbeddings('tr')
        char_embedding = CharacterEmbeddings()

        # bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
        embedding_types: List[TokenEmbeddings] = [
            fast_text_embedding, glove_embedding, word2vec_embedding,
            char_embedding
        ]
        # embedding_types: List[TokenEmbeddings] = [custom_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "char":
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=[CharacterEmbeddings()])
    else:
        embeddings = None

    return embeddings
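A hypothetical params dict for the "bert" branch of create_embeddings (the model name is chosen only for illustration):

params = {
    "embedding_type": "bert",
    "bert_model_dirpath_or_name": "bert-base-multilingual-cased",
}
embeddings = create_embeddings(params)  # StackedEmbeddings wrapping one BertEmbeddings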
Example #10
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                print(
                    f"Corresponding flair embedding module not found for {model_name_or_path}"
                )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #11
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #12
 def transform(self, X: dt.Frame):
     X.replace([None, math.inf, -math.inf], self._repl_val)
     from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
     if self.embedding_name in ["glove", "en"]:
         self.embedding = WordEmbeddings(self.embedding_name)
     elif self.embedding_name in ["bert"]:
         self.embedding = BertEmbeddings()
     self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
     output = []
     X = X.to_pandas()
     text1_arr = X.iloc[:, 0].values
     text2_arr = X.iloc[:, 1].values
     for ind, text1 in enumerate(text1_arr):
         try:
             text1 = Sentence(str(text1).lower())
             self.doc_embedding.embed(text1)
             text2 = text2_arr[ind]
             text2 = Sentence(str(text2).lower())
             self.doc_embedding.embed(text2)
             score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                       text2.get_embedding().reshape(1,
                                                                     -1))[0,
                                                                          0]
             output.append(score)
         except Exception:
             output.append(-99)
     return np.array(output)
Example #13
 def _get_vectorizer(vectorizer, training_mode, features_file="features.pkl"):
     token_pattern = r'\S+'
     if training_mode:
         vocabulary = None
     else:
         if vectorizer not in [HashingVectorizer.__name__, DocumentPoolEmbeddings.__name__]:
             vocabulary = pickle_manager.load(features_file)
     if vectorizer == TfidfVectorizer.__name__:
         v = TfidfVectorizer(input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=[], token_pattern=token_pattern,
                 ngram_range=(1,1), max_df=1.0, min_df=1, max_features=None,
                 vocabulary=vocabulary, binary=False, dtype=np.float64, norm='l2',
                 use_idf=True, smooth_idf=True, sublinear_tf=False)
     elif vectorizer == CountVectorizer.__name__:
         v = CountVectorizer(input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, stop_words=[],
                 token_pattern=token_pattern, ngram_range=(1, 1),
                 analyzer='word', max_df=1.0, min_df=1, max_features=None,
                 vocabulary=vocabulary, binary=False, dtype=np.int64)
     elif vectorizer == HashingVectorizer.__name__:
         v = HashingVectorizer(input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, stop_words=[],
                 token_pattern=token_pattern, ngram_range=(1, 1),
                 analyzer='word', n_features=1048576, binary=False,
                 norm='l2', alternate_sign=True, non_negative=False,
                 dtype=np.float64)
     elif vectorizer == DocumentPoolEmbeddings.__name__:
         v = DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-uncased')])
     else:
         raise "Invalid vectorizer."
     return v
Example #14
    def __init__(self, gpu, bert_embeddings_dim, freeze_bert_embeddings=True):
        super(LayerBertEmbeddings, self).__init__(gpu)
        self.gpu = gpu
        self.bert_embeddings_dim = bert_embeddings_dim
        self.freeze_char_embeddings = freeze_bert_embeddings

        # self.bert = BertModel.from_pretrained("/home/jlfu/saved_pytorch_bert/en_base_uncased/model")
        # # for p in self.bert.parameters():
        # #     p.requires_grad = True
        # self.tokenizer = BertTokenizer.from_pretrained('/home/jlfu/saved_pytorch_bert/en_base_uncased/vocab.txt')

        # self.bert = BertModel.from_pretrained("/home/jlfu/model/cased_L-12_H-768_A-12/bert_model.ckpt.gz")
        # self.bert = BertModel.from_pretrained('bert-base-cased')

        self.bert = BertModel.from_pretrained(
            "/home/jlfu/model/cased_L-12_H-768_A-12/bert-base-cased.tar.gz")
        self.tokenizer = BertTokenizer.from_pretrained(
            '/home/jlfu/model/cased_L-12_H-768_A-12/vocab.txt')

        self.Wbert = nn.Linear(768, bert_embeddings_dim)
        self.output_dim = 768

        # self.output_dim =  3072

        self.bert_embedding = BertEmbeddings(
            "bert-base-cased"
        )  # bert-base-cased,  bert-base-multilingual-cased
Example #15
def compute_query_embedding_pretrained_models(query):
    """
        learn bert sentence embeddings using pre-trained word embedding model for an arbitrary question
       :param query: String :: arbitrary sentence
       :return: n-dimensional embedding
    """
    query = query
    up = os.path.abspath(os.path.dirname(__file__))
    conf_path = os.path.join(up, 'config.ini')
    embedding_name = 'bert'
    transformer_model_or_path = get_config(
        'Transformers', '{}_model_or_path'.format(embedding_name), conf_path,
        0)
    transformer_layers = get_config('Transformers',
                                    '{}_layers'.format(embedding_name),
                                    conf_path, 0)
    transformer_pooling_operation = get_config(
        'Transformers', '{}_pooling_operation'.format(embedding_name),
        conf_path, 0)
    transformer_use_scalar_mix = get_config(
        'Transformers', '{}_use_scalar_mix'.format(embedding_name), conf_path,
        1)

    word_embedding = BertEmbeddings(
        bert_model_or_path=transformer_model_or_path,
        layers=transformer_layers,
        pooling_operation=transformer_pooling_operation,
        use_scalar_mix=transformer_use_scalar_mix)
    document_embeddings = DocumentPoolEmbeddings([word_embedding],
                                                 fine_tune_mode='none')
    query_embedding = compute_pretrained_individual_transformer_embedding(
        query, word_embedding, document_embeddings)
    return query + ',' + str(list(query_embedding))[1:-1]
Example #16
    def construct_embeddings(
            self, embeddings: Sequence[str]
    ) -> List[flair.embeddings.TokenEmbeddings]:
        ret = list()
        for name in embeddings:
            emb_type, emb_name, o_file, w_file = FlairDataset.extract_embedding_configs(
                name)
            self.embeddings_type = emb_type.lower()

            if name.startswith('bert'):
                embs = BertEmbeddings(emb_name)
                self.bert_tokenizer = embs.tokenizer
                ret.append(embs)
            elif name.startswith('flair'):
                ret.append(FlairEmbeddings(emb_name))
            elif name.startswith('elmo'):
                embs = ELMoEmbeddings(emb_name,
                                      options_file=o_file,
                                      weight_file=w_file)
                ret.append(embs)
            elif name.startswith('word'):
                ret.append(WordEmbeddings(emb_name))
            else:
                raise ValueError('Invalid Embedding Type: "{}"'.format(name))

        return ret
Example #17
def get_bert(model_name,
             layers='-1,-2,-3,-4',
             pooling_op='first',
             scalar_mix=False):
    return BertEmbeddings(model_name,
                          layers=layers,
                          pooling_operation=pooling_op,
                          use_scalar_mix=scalar_mix)
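A brief, hedged usage sketch of get_bert with its defaults (last four layers, first-subtoken pooling); the dimensionality comment assumes a bert-base model:

from flair.data import Sentence

bert = get_bert('bert-base-uncased')
sentence = Sentence("Scalar mix averages the layers instead of concatenating them.")
bert.embed(sentence)
print(sentence.tokens[0].embedding.shape)  # 4 layers x 768 = 3072 dims for bert-base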
Example #18
	def __init__(self, len, emb='en'):
		"""
		Args:
			len (int): max length for the model input
			emb (str, optional): embedding language. Defaults to 'en'.
		"""
		if emb=='en':	self.embedder = BertEmbeddings("distilbert-base-uncased")
		self.MAX_LEN = len
Example #19
    def __init__(self, MAX_WORD_N=150, MAX_SENT_N=30, MAX_WORD_SENT_N=300, albert_model="albert-base-v2") -> None:
        super().__init__()
        albert = BertEmbeddings(bert_model_or_path=albert_model)
        self.albert_embedding = DocumentPoolEmbeddings([albert])
        self.MAX_WORD_N = MAX_WORD_N
        self.MAX_SENT_N = MAX_SENT_N
        self.MAX_WORD_SENT_N = MAX_WORD_SENT_N

        self.sentence_piecer = MySentencePiecer()
Example #20
def args_init(args):
    # initialize word2vec
    args.word2vec = KeyedVectors.load_word2vec_format('data/mymodel-new-5-%d' %
                                                      args.model_dim,
                                                      binary=True)

    # initialize contextual embedding dimensions
    if args.contextual_embedding == 'word2vec':
        args.word_dim = args.tag_dim = args.dis_dim = 50
        args.stacked_embeddings = 'word2vec'
    elif args.contextual_embedding == 'elmo':  #glove + elmo
        args.word_dim = args.tag_dim = args.dis_dim = 868
        ## stacked embeddings
        # create a StackedEmbedding object that combines glove and forward/backward flair embeddings
        args.stacked_embeddings = StackedEmbeddings(
            [WordEmbeddings('glove'),
             ELMoEmbeddings('small')])

    elif args.contextual_embedding == 'bert':  #glove + bert
        args.word_dim = args.tag_dim = args.dis_dim = 3172
        args.stacked_embeddings = StackedEmbeddings(
            [WordEmbeddings('glove'),
             BertEmbeddings('bert-base-uncased')])
        args.batch_size = 8

    elif args.contextual_embedding == 'flair':  #glove + flair-forward + flair-backward
        args.word_dim = args.tag_dim = args.dis_dim = 4196
        args.stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings('mix-forward', chars_per_chunk=128),
            FlairEmbeddings('mix-backward', chars_per_chunk=128)
        ])
        # batch size stays at 8 regardless of agent mode
        args.batch_size = 8

    elif args.contextual_embedding == 'glove':  # not tested
        args.word_dim = args.tag_dim = args.dis_dim = 100
        args.stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
        ])

    # weights loaded, set exploration rate to minimum
    if args.load_weights:  # 1 to 0.1. decayed to minimum.
        args.exploration_rate_start = args.exploration_rate_end

    # agent mode arguments, set number of words to 100
    if args.agent_mode == 'arg':
        args.num_words = args.context_len
        args.display_training_result = 0

    args.result_dir = 'results/%s_%s_%s' % (args.domain, args.agent_mode,
                                            args.contextual_embedding)

    return args
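The hard-coded dimensions above appear to be GloVe's 100 plus the contextual model's output (e.g. 100 + 3072 for glove + bert-base-uncased = 3172); a hedged sanity check of that arithmetic:

from flair.data import Sentence
from flair.embeddings import WordEmbeddings, BertEmbeddings, StackedEmbeddings

stack = StackedEmbeddings([WordEmbeddings('glove'), BertEmbeddings('bert-base-uncased')])
sentence = Sentence("check the stacked dimensionality")
stack.embed(sentence)
assert sentence.tokens[0].embedding.shape[0] == 3172  # 100 (GloVe) + 3072 (4 BERT layers)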
Example #21
def collect_features(embeddings):
    for embedding in embeddings:
        if embedding in {"fasttext"}:
            yield WordEmbeddings("de")
        elif embedding in {"bert"}:
            yield BertEmbeddings("bert-base-multilingual-cased", layers="-1")
        elif embedding in {"flair-forward"}:
            yield FlairEmbeddings("german-forward")
        elif embedding in {"flair-backward"}:
            yield FlairEmbeddings("german-backward")
Example #22
def load_flair(mode = 'flair'):
    if mode == 'flair':
        stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
            PooledFlairEmbeddings('news-forward', pooling='min'),
            PooledFlairEmbeddings('news-backward', pooling='min')
        ])
    else:##bert
        stacked_embeddings = BertEmbeddings('bert-base-uncased')  ##concat last 4 layers give the best
    return stacked_embeddings
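A short hypothetical usage of load_flair; the 'flair' branch returns min-pooled Flair embeddings stacked with GloVe, while the other branch just returns BertEmbeddings:

from flair.data import Sentence

embedder = load_flair(mode='flair')
sentence = Sentence("Pooled Flair embeddings accumulate evidence across sentences.")
embedder.embed(sentence)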
Example #23
def get_flair_class(embed_type):
    """
    Return the correct flair class for the embed type
    """
    if embed_type == 'elmo':
        fl_embed = ELMoEmbeddings()
    elif embed_type == 'bert':
        fl_embed = BertEmbeddings()
    else:
        fl_embed = FlairEmbeddings(embed_type)
    return fl_embed
Example #24
    def _get_stacked_embeddings(self) -> StackedEmbeddings:
        layers = ",".join(str(layer) for layer in self.experiment.layers)
        pooling_operation = self.experiment.pooling_operation

        token_embeddings = []

        for embedding in self.experiment.embeddings:
            if embedding.startswith("roberta"):
                token_embeddings.append(
                    RoBERTaEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif (embedding.startswith("bert")
                  or embedding.startswith("distilbert")
                  or embedding.startswith("spanbert")):
                token_embeddings.append(
                    BertEmbeddings(
                        bert_model_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("elmo"):
                model_name = embedding.split("-")[-1]
                token_embeddings.append(ELMoEmbeddings(model=model_name))
            elif embedding.startswith("gpt2"):
                token_embeddings.append(
                    OpenAIGPT2Embeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("xlm"):
                token_embeddings.append(
                    XLMEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("xlnet"):
                token_embeddings.append(
                    XLNetEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))

        return StackedEmbeddings(embeddings=token_embeddings)
Example #25
def get_flair_bert_embeddings(words):

    # Experimental -- not tested

    from flair.embeddings import BertEmbeddings

    bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

    sentence = Sentence(words)
    bert_embedding.embed(sentence)

    return (sentence)
Example #26
def get_sentence_embeddings(texts):
    word_embeddings = [BertEmbeddings('bert-large-uncased')]
    document_embeddings = DocumentPoolEmbeddings(word_embeddings)

    sentences = [Sentence(text) for text in texts]
    embeddings = []

    for sentence in sentences:
        document_embeddings.embed(sentence)
        sentence_embedding = sentence.get_embedding().numpy().reshape(-1)
        embeddings.append(sentence_embedding)

    return np.array(embeddings)
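A hypothetical follow-up comparing two documents with the vectors from get_sentence_embeddings (assumes the embeddings end up on CPU, since the snippet calls .numpy() directly):

from sklearn.metrics.pairwise import cosine_similarity

vecs = get_sentence_embeddings(["a first example text", "a second example text"])
print(cosine_similarity(vecs[:1], vecs[1:])[0, 0])  # similarity of the two documents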
Example #27
    def dump_bert_vecs(df, dump_dir):
        print("Getting BERT vectors...")
        embedding = BertEmbeddings('bert-base-uncased')
        word_counter = defaultdict(int)
        stop_words = set(stopwords.words('english'))
        stop_words.add("would")
        except_counter = 0

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        for index, row in df.iterrows():
            if index % 100 == 0:
                print("Finished sentences: " + str(index) + " out of " +
                      str(len(df)))
            # all sentences are lowercased here
            line = row["sentence"].lower()
            sentences = sent_tokenize(line)
            for sentence_ind, sent in enumerate(sentences):
                tokenized_text = tokenizer.tokenize(sent)
                if len(tokenized_text) > 512:
                    print('sentence too long for BERT: truncating')
                    # truncate on whitespace tokens; slicing the raw string would
                    # space-separate individual characters
                    sentence = Sentence(' '.join(sent.split()[:512]),
                                        use_tokenizer=True)
                else:
                    sentence = Sentence(sent, use_tokenizer=True)
                try:
                    embedding.embed(sentence)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while getting BERT: ",
                          except_counter, sentence_ind, index, e)
                    print(sentence)
                    continue
                for token_ind, token in enumerate(sentence):
                    word = token.text
                    word = word.translate(
                        str.maketrans('', '', string.punctuation))
                    if word in stop_words or "/" in word or len(word) == 0:
                        continue
                    word_dump_dir = dump_dir + word
                    os.makedirs(word_dump_dir, exist_ok=True)
                    fname = word_dump_dir + "/" + str(
                        word_counter[word]) + ".pkl"
                    word_counter[word] += 1
                    vec = token.embedding.cpu().numpy()
                    try:
                        with open(fname, "wb") as handler:
                            pickle.dump(vec, handler)
                    except Exception as e:
                        except_counter += 1
                        print("Exception Counter while dumping BERT: ",
                              except_counter, sentence_ind, index, word, e)
Example #28
    def embed_text(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
        model_name_or_path: str = "bert-base-cased",
    ) -> List[Sentence]:
        """ Produces embeddings for text

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        * **model_name_or_path** - The hosted model name key or model path
        **return** - A list of Flair's `Sentence`s
        """
        # Convert into sentences
        if isinstance(text, str):
            sentences = Sentence(text)
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        else:
            sentences = text

        # Load correct Embeddings module
        if not self.models[model_name_or_path]:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.models[model_name_or_path] = BertEmbeddings(
                    model_name_or_path)
            elif "roberta" in model_name_or_path:
                self.models[model_name_or_path] = RoBERTaEmbeddings(
                    model_name_or_path)
            elif "gpt2" in model_name_or_path:
                self.models[model_name_or_path] = OpenAIGPT2Embeddings(
                    model_name_or_path)
            elif "xlnet" in model_name_or_path:
                self.models[model_name_or_path] = XLNetEmbeddings(
                    model_name_or_path)
            elif "xlm" in model_name_or_path:
                self.models[model_name_or_path] = XLMEmbeddings(
                    model_name_or_path)
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.models[model_name_or_path] = FlairEmbeddings(
                    model_name_or_path)
            else:
                try:
                    self.models[model_name_or_path] = WordEmbeddings(
                        model_name_or_path)
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )
                return Sentence("")
        embedding = self.models[model_name_or_path]
        return embedding.embed(sentences)
Example #29
    def train(corpus):
        print(corpus)

        # 2. what tag do we want to predict?
        tag_type = 'ner'

        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary.idx2item)

        # 4. initialize embeddings
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings()
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        # 5. initialize sequence tagger

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=True)

        # 6. initialize trainer

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        checkpoint = 'resources/taggers/presidio-ner/checkpoint.pt'
        # trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)
        trainer.train('resources/taggers/presidio-ner',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150,
                      checkpoint=True)

        sentence = Sentence('I am from Jerusalem')
        # run NER over sentence
        tagger.predict(sentence)

        print(sentence)
        print('The following NER tags are found:')

        # iterate over entities and print
        for entity in sentence.get_spans('ner'):
            print(entity)
Example #30
    def __init__(self, train_path, dev_path, test_path, dataset):
        loaddata = LoadData(train_path, dev_path, test_path)
        a = loaddata.conllu_counter[dataset]
        counter = loaddata.counter_process(a)
        embedding = BertEmbeddings('bert-base-uncased')

        for i in trange(len(counter)):
            counter[i].sentence_emb = tuple(
                self.Embedding(counter[i].sentence, embedding))
            print(torch.tensor(counter[i].sentence_emb).shape)
            counter[i].index = i
        self.data = counter
        self.len = len(self.data)
        print(dataset, self.len)