Example #1
def stacking_embedding(df):
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_news_forward = FlairEmbeddings('news-forward')
    flair_embedding_news_backward = FlairEmbeddings('news-backward')
    bert_embedding = BertEmbeddings()
    elmo_embedding = ELMoEmbeddings()


    stacked_embeddings = StackedEmbeddings([
        glove_embedding, 
        flair_embedding_news_forward, 
        flair_embedding_news_backward,
        bert_embedding,
        elmo_embedding
    ])
    
    for index, row in tqdm(df.iterrows(), total=len(df), desc='Embedding'):
        sentence = Sentence(row['name'])
        token_series = set()
        for token in sentence:
            token_series.add(token.embedding)
Example #2
 def _train_model(self):
     # type: () -> None
     corpus = ClassificationCorpus(
         Path(__path_to_base__),
         test_file=os.path.basename(self.path_to_test),
         dev_file=os.path.basename(self.path_to_dev),
         train_file=os.path.basename(self.path_to_train))
     word_embeddings = [
         ELMoEmbeddings('original'),
         FlairEmbeddings('news-forward-fast'),
         FlairEmbeddings('news-backward-fast')
     ]
     document_embeddings = DocumentRNNEmbeddings(
         word_embeddings,
         hidden_size=512,
         reproject_words=True,
         reproject_words_dimension=256)
     classifier = TextClassifier(
         document_embeddings,
         label_dictionary=corpus.make_label_dictionary(),
         multi_label=False)
     trainer = ModelTrainer(classifier, corpus)
     trainer.train(__path_to_base__, max_epochs=10)
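The call above writes the trained model under the corpus base path; a minimal prediction sketch for the result ('final-model.pt' is Flair's default output file name, and the input text is an assumption):

from flair.data import Sentence
from flair.models import TextClassifier

# load the classifier written by trainer.train() and label a new sentence
classifier = TextClassifier.load(Path(__path_to_base__) / 'final-model.pt')
sentence = Sentence('an example document to classify')
classifier.predict(sentence)
print(sentence.labels)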
Example #3
File: train.py  Project: vamshinr/NER
def hyper_opt(corpus):
    print("hyper_opt is started")
    # define your search space
    search_space = SearchSpace()

    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([
                             WordEmbeddings('en'),
                             WordEmbeddings('glove'),
                             CharacterEmbeddings(),
                             FlairEmbeddings('news-forward'),
                             FlairEmbeddings('news-backward'),
                             ELMoEmbeddings()
                         ])
                     ])

    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
    #search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    #search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.1])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[32, 64])

    # create the parameter selector
    param_selector = SequenceTaggerParamSelector(
        corpus,
        'ner',
        #'/content/gdrive/My Drive/resume_ner_data/hyperparam_selection',
        model_path,
        max_epochs=50,
        training_runs=2,
        optimization_value=OptimizationValue.DEV_SCORE)

    # start the optimization
    param_selector.optimize(search_space, max_evals=100)
Example #4
import re

# Create Flask application here
app = Flask(__name__)

# Load blueprint for flask
bp = Blueprint('note', __name__)

STOP_WORDS = set(ENGLISH_STOP_WORDS)
STOP_WORDS.remove('no')
STOP_WORDS.remove('not')

# Load lemmatizer for BLEU score and the Stanford NLP pipeline
lemmatizer = WordNetLemmatizer()
nlp = stanfordnlp.Pipeline(processors="tokenize,mwt,pos,depparse")
embedding = ELMoEmbeddings('pubmed')

# Define upload folder location
# UPLOAD_FOLDER = '/home/TheLumino/UCSF_NLP_UI/Uploads'
UPLOAD_FOLDER = 'Uploads'

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Load word2vec and random forest model
# currentmodel = Word2Vec.load('/home/TheLumino/UCSF_NLP_UI/flaskr/cosine_model/cosine_similarity_metric')
# rf_model = pickle.load(open('/home/TheLumino/UCSF_NLP_UI/flaskr/cosine_model/random_forest_confidence_score.sav', 'rb'))
currentmodel = Word2Vec.load('flaskr/cosine_model/bestmodel')
rf_model = pickle.load(open('flaskr/cosine_model/random_forest_confidence_score.sav', 'rb'))

# set UMLS database links for quick access from UI
rx_url = 'https://mor.nlm.nih.gov/RxNav/search?searchBy=RXCUI&searchTerm='
Example #5
# ####

# load pretrained CamemBERT with hidden states exposed
# (from_pretrained is a classmethod and returns the loaded model)
cbc = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
bert = CamembertModel.from_pretrained("camembert-base", config=cbc)
bert = bert.eval()
bert = bert.to(device)
bert_tok = CamembertTokenizer.from_pretrained("camembert-base")

app = Flask(__name__)

# create a StackedEmbedding object that combines glove and forward/backward flair embeddings

SIZE_EMBED = -1
adaptive_pool = nn.AdaptiveAvgPool1d(SIZE_EMBED) if SIZE_EMBED > 0 else None
embedder = ELMoEmbeddings("small")
# bert_model_or_path="distilbert-base-uncased",
#   pooling_operation="mean", use_scalar_mix=True)


@app.route('/vectorize', methods=['POST'])
def vectorize():
    tokens = request.json["tokens"]

    embeddings = []
    # print("Call")
    for m in tokens:
        mot = Sentence(m)
        embedder.embed(mot)
        embed = mot[0].embedding
Example #6
sentence = Sentence('The grass is green .')
flair_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run BERT embeddings
embedding = BertEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run ELMo embeddings
embedding = ELMoEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run stacked (mixed) embeddings
stacked_embeddings = StackedEmbeddings([WordEmbeddings('model/glove.gensim'), FlairEmbeddings('model/news-forward-0.4.1.pt')])
sentence = Sentence('The grass is green .')
stacked_embeddings.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# CharacterEmbeddings and BytePairEmbeddings: the model download at run time will fail without unrestricted internet access
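A minimal sketch of that last case, following the same pattern as the snippets above (note that CharacterEmbeddings normally trains its character features with the downstream task, so its vectors are randomly initialised here):

from flair.embeddings import CharacterEmbeddings, BytePairEmbeddings

embedding = StackedEmbeddings([CharacterEmbeddings(), BytePairEmbeddings('en')])
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)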
Example #7
from flair.models import SequenceTagger

# Set up the Corpus
columns = {0: 'text', 1:'ner'}

data_folder = './data/IOBES'

corpus: Corpus = ColumnCorpus(data_folder, columns, train_file="train.txt", dev_file="dev.txt", test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([ ELMoEmbeddings('original') ]),
    StackedEmbeddings([ ELMoEmbeddings('original'), CharacterEmbeddings() ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1,2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])


# initialise embeddings

param_selector = SequenceTaggerParamSelector(
    corpus,
Example #8
test_data = all_annos[4500:6200]
dev_data = all_annos[6200:]

search_space = SearchSpace()

#Create our embedding stacks
#Flair recommends adding GloVe to their character-level embeddings

flair_normal = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('mix-forward'),
    FlairEmbeddings('mix-backward')
])

bert = BertEmbeddings()
elmo = ELMoEmbeddings('original')
flair_pooled = StackedEmbeddings([
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('mix-forward'),
    PooledFlairEmbeddings('mix-backward')
])

search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[bert, elmo, flair_normal, flair_pooled])

#other hyperparams are kept fixed for this exercise.
#Add to the lists to add to the grid
#unfortunately for small grids, Flair picks random search instead of true
#grid search
Example #9
def init_embeddings(corpus_name, embedding_type):
    """
    Initializes embeddings for a given corpus.

    Parameters:
        corpus_name (str): name of the corpus used to load proper embeddings
        embedding_type (str): type of embeddings (e.g. flair, elmo, bert, word+char)
    
    Returns:
        tuple(StackedEmbeddings, bool): loaded embeddings
    """

    from typing import List
    from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import BertEmbeddings, ELMoEmbeddings
    from flair.embeddings import WordEmbeddings, CharacterEmbeddings

    embedding_types: List[TokenEmbeddings] = []

    if corpus_name in ['conll03_en']:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(FlairEmbeddings('news-forward'))
            embedding_types.append(FlairEmbeddings('news-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'bert':
            embedding_types.append(
                BertEmbeddings(bert_model_or_path='bert-base-cased'))
            #embedding_types.append(BertEmbeddings(bert_model_or_path='bert-large-cased'))
            embeddings_in_memory = True
        elif embedding_type == 'elmo':
            embedding_types.append(ELMoEmbeddings())
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)

    elif corpus_name in ["conll03_de", "germeval"]:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(FlairEmbeddings('german-forward'))
            embedding_types.append(FlairEmbeddings('german-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)
    else:
        log.error(f"unknown corpus or embeddings '{corpus_name}'!")
        exit(EXIT_FAILURE)

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    log.info("'{}' function finished!".format(sys._getframe().f_code.co_name))

    return embeddings, embeddings_in_memory
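A possible usage sketch for the function above (the calling code is an assumption, not part of the original module):

embeddings, embeddings_in_memory = init_embeddings('conll03_en', 'flair')
print(embeddings.embedding_length, embeddings_in_memory)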
Example #10
cuda_device = 0 if str(device) != 'cpu' else -1

model = allennlp.commands.elmo.ElmoEmbedder(
    options_file='path_to_pretrain_elmo_options.json',
    weight_file='path_to_pretrain_elmo_weights.hdf5',
    cuda_device=cuda_device)
###

embedding_types: List[TokenEmbeddings] = [
    FlairEmbeddings('multi-forward'),
    FlairEmbeddings('multi-backward'),
    FlairEmbeddings('path_to_pretrain_flair_forward.pt'),
    FlairEmbeddings('path_to_pretrain_flair_backward.pt'),
    WordEmbeddings('glove'),
    ELMoEmbeddings('medium'),
    ELMoEmbeddings('pubmed'),
    OwnELMoEmbeddings(model),
    UMLSEmbedding(),
    CCREmbedding()
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
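The snippet stops after constructing the ModelTrainer; actually starting training would look roughly like this (output path, learning rate and epoch count are assumptions):

trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)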
Example #11
data['label'] = '__label__' + data['label'].astype(str)

#train-test split
data.iloc[0:int(len(data)*0.8)].to_csv(PATH/'flair/train.csv', sep='\t', index = False, header = False)
data.iloc[int(len(data)*0.8):int(len(data)*1)].to_csv(PATH/'flair/test.csv', sep='\t', index = False, header = False)


corpus = ClassificationCorpus(Path('/content/drive/My Drive/emnlp/flair/'), test_file='test.csv', dev_file='test.csv',train_file='train.csv')

print(corpus.obtain_statistics())


## use any pretrained stacked embedding from the FLAIR Framework
# embedding = RoBERTaEmbeddings()
# embedding = BertEmbeddings('bert-base-uncased')
embedding = ELMoEmbeddings('small')

#stack them with other embeddings
word_embeddings = [
            embedding,
#             FlairEmbeddings('news-forward',use_cache=True),
#             FlairEmbeddings('news-backward',use_cache=True),
        ]

#apply document LSTM to the stacked embeddings
document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
#         hidden_size=512,
#         reproject_words=True,
#         reproject_words_dimension=256,
    )
Example #12
use_flair = False
use_elmo = False
use_bert = False
mini_batch_size = 32
word_embeddings = []

if use_glove:
    word_embeddings.append(WordEmbeddings('glove'))
    word_embeddings.append(CharacterEmbeddings())
if use_cui2vec:
    word_embeddings.append(WordEmbeddings('./cui2vec_embed_vectors.bin'))
if use_flair:
    word_embeddings.append(FlairEmbeddings('./forward-lm.pt'))
    word_embeddings.append(FlairEmbeddings('./backward-lm.pt'))
if use_elmo:
    word_embeddings.append(ELMoEmbeddings('pubmed'))
if use_bert:
    word_embeddings.append(BertEmbeddings('./bert-base-clinical-cased'))
    mini_batch_size = 8

stacked_word_embeddings = StackedEmbeddings(word_embeddings)

from flair.embeddings import DocumentRNNEmbeddings

document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                            rnn_type='LSTM',
                                            bidirectional=True,
                                            hidden_size=512,
                                            reproject_words=True,
                                            reproject_words_dimension=256)
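The document embeddings built above would typically feed a text classifier, along the lines of Example #2; a sketch in which the corpus and output path are assumptions:

from flair.models import TextClassifier
from flair.trainers import ModelTrainer

classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),  # assumed corpus
                            multi_label=False)
trainer = ModelTrainer(classifier, corpus)
trainer.train('resources/clinical-classifier',
              max_epochs=10,
              mini_batch_size=mini_batch_size)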
Example #13
parser.add_argument(
    '--model_name',
    default='large',
    action='store',
)

parser.add_argument(
    '--lm_emb_save_path',
    default='./wv/elmo.emb.pkl',
    action='store',
)

args = parser.parse_args()

embedding = ELMoEmbeddings(args.model_name)

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)
    embedding.embed(s)
Example #14
corpus: Corpus = ColumnCorpus(data_folder,
                              columns,
                              train_file="train.txt",
                              dev_file="test.txt",
                              test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[
                     StackedEmbeddings([ELMoEmbeddings('original')]),
                     StackedEmbeddings(
                         [ELMoEmbeddings('original'),
                          CharacterEmbeddings()])
                 ])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise embeddings

param_selector = SequenceTaggerParamSelector(
Example #15
     test_file='test_.tsv',
     dev_file='dev.tsv',
     train_file='train.tsv')
 # way to select language model
 model_selector = {
     "Glove": [WordEmbeddings('glove')],
     "FastText": [WordEmbeddings('en-news')],
     "BPE": [BytePairEmbeddings('en')],
     "FlairFast": [
         FlairEmbeddings('news-forward-fast'),
         FlairEmbeddings('news-backward-fast')
     ],
     "FlairNews":
     [FlairEmbeddings('news-forward'),
      FlairEmbeddings('news-backward')],
     "ElmoOriginal": [ELMoEmbeddings('original')],
     'Bert': [BertEmbeddings('large-uncased')],
     'BertLS': [
         BertEmbeddings(bert_model_or_path='bert-large-uncased',
                        layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                        "15,16,17,18,19,20,21,22,23,24",
                        use_scalar_mix=True)
     ],
     "RoBERTa": [RoBERTaEmbeddings('roberta-base')],
     "RoBERTaL": [RoBERTaEmbeddings('roberta-large')],
     "RoBERTaLS": [
         RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-large",
                           layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                           "15,16,17,18,19,20,21,22,23,24",
                           use_scalar_mix=True)
     ],
Example #16
)
# # 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    # WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    CharacterEmbeddings(
        path_to_char_dict=
        "/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/abs/flair-custom/custom_dict.pkl"
    ),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # CharLMEmbeddings('news-forward',use_cache=True),
    ELMoEmbeddings('elmo-small'),
    # BertEmbeddings(),
    # FlairEmbeddings('news-backward-fast'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

#5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        rnn_layers=2,
                                        tag_type=tag_type,
                                        use_crf=True)
Example #17
def get_elmo(model_name):
    return ELMoEmbeddings(model_name)
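A call would simply pass one of Flair's ELMo model names, for example (usage is an assumption):

elmo = get_elmo('original')  # 'original', 'small', 'medium' and 'pubmed' are common choices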
Example #18
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )

    #document_embeddings = DocumentPoolEmbeddings([
    #    stacked_embedding,
    #    FlairEmbeddings('it-forward'),
    #    FlairEmbeddings('it-backward')],pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        #checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
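A possible invocation of the function above (directory and file names are assumptions):

from pathlib import Path

trainer(Path('data/sentiment'),
        ('train.csv', 'dev.csv', 'test.csv'),
        checkpoint='',
        stack='elmo',
        n_epochs=10)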
Example #19
    dev_file=None)
#corpus.downsample(0.1)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)
print(corpus.train[0].to_tagged_string('ner'))

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = []

if embedding == "ep":
    embedding_types = [WordEmbeddings('glove'), ELMoEmbeddings('pubmed')]
elif embedding == "fp":
    embedding_types = [
        WordEmbeddings('glove'),
        FlairEmbeddings('./forward-lm.pt'),
        FlairEmbeddings('./backward-lm.pt')
    ]
elif embedding == "fpd":
    embedding_types = [
        WordEmbeddings('glove'),
        FlairEmbeddings('pubmed-forward'),
        FlairEmbeddings('pubmed-backward')
    ]
elif embedding == "cb":
    embedding_types = [
        WordEmbeddings('glove'),
Example #20
    with timer('turian'):
        embedding = WordEmbeddings('turian')
        result = w2v_flair(train["Description"], embedding, name="turian")
        result.to_feather("../feature/turian_flair.feather")

    with timer('twitter'):
        embedding = WordEmbeddings('twitter')
        result = w2v_flair(train["Description"], embedding, name="twitter")
        result.to_feather("../feature/twitter_flair.feather")

    with timer('news'):
        embedding = FlairEmbeddings('news-forward')
        result = w2v_flair(train["Description"], embedding, name="news_flair")
        result.to_feather("../feature/news_flair.feather")

    with timer('char'):
        embedding = CharacterEmbeddings()
        result = w2v_flair(train["Description"], embedding, name="char")
        result.to_feather("../feature/char_flair.feather")

    with timer('byte_pair'):
        embedding = BytePairEmbeddings('en')
        result = w2v_flair(train["Description"], embedding, name="byte_pair")
        result.to_feather("../feature/byte_pair_flair.feather")

    with timer('elmo'):
        embedding = ELMoEmbeddings('medium')
        result = w2v_flair(train["Description"], embedding, name="elmo")
        result.to_feather("../feature/elmo_flair.feather")
Example #21
                premise_text = remove_punctuations(premise_text)
            premise_sentence = Sentence(premise_text)
            document_embedding.embed(premise_sentence)
            embedded_premise = premise_sentence.get_embedding().detach().numpy(
            ).tolist()
            embedded_premises[premise[2]] = embedded_premise
            argument_uid = premise[0]
        embedded_arguments[argument_uid] = [
            embedded_conclusion, embedded_premises
        ]

        save_embedding(embedded_arguments, file_name)


if __name__ == '__main__':
    elmo_embedding = ELMoEmbeddings()
    compute_embedding(elmo_embedding,
                      remove_punctuation=True,
                      file_name="elmo_embeddings_without_punctuation.json")
    compute_embedding(elmo_embedding,
                      remove_punctuation=False,
                      file_name="elmo_embeddings_with_punctuation.json")

    bert_embedding = BertEmbeddings()
    compute_embedding(bert_embedding,
                      remove_punctuation=True,
                      file_name="bert_embeddings_without_punctuation.json")
    compute_embedding(bert_embedding,
                      remove_punctuation=False,
                      file_name="bert_embeddings_with_punctuation.json")
Example #22
def call_test(skip_list=[],
              test_name="",
              langs=[],
              template="",
              params={},
              ANALOGIES_FILE="",
              ANALOGIES_DIR="",
              EMBEDDINGS_DIR=""):
    logger.debug('call_test : {0}'.format(test_name))
    if template == 'sick':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            params["flair_model"] = ELMoEmbeddings('original')
            model = Embedding(**params)
            measure = get_measure(model, 'en', test_name, sick=True)
            yield measure

    elif template == 'bert':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            params["bert_model"] = BertClient()
            model = Embedding(**params)
            measure = get_measure(model, lang, test_name)
            yield measure

    elif template == 'flair':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            params["flair_model"] = ELMoEmbeddings('pt')
            model = Embedding(**params)
            measure = get_measure(model, lang, test_name)
            yield measure

    elif template == 'flair-custom-1' or template == 'flair-custom-2':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            if template == 'flair-custom-1':
                params["flair_model"] = ELMoEmbeddings(
                    options_file="../embeddings/elmo/options.json",
                    weight_file="../embeddings/elmo/elmo_pt_weights.hdf5")
            elif template == 'flair-custom-2':
                params["flair_model"] = ELMoEmbeddings(
                    options_file="../embeddings/elmo/options_dgx1.json",
                    weight_file="../embeddings/elmo/elmo_pt_weights_dgx1.hdf5")
            model = Embedding(**params)
            measure = get_measure(model, lang, test_name)
            yield measure

    elif template == 'gensim' or template == 'flair-gensim' or template == 'custom-flair-gensim-1' or template == 'custom-flair-gensim-2' or template == "flair-gensim-local":
        assert (EMBEDDINGS_DIR != None)
        logger.debug("Template: " + template)
        logger.debug("EMBEDDINGS_DIR: " + EMBEDDINGS_DIR)
        logger.debug("skip-list: " + str(skip_list))
        for fname in get_NILC(EMBEDDINGS_DIR):
            logger.debug("Embedding: " + fname)
            for item in skip_list:
                if item in fname:
                    logger.debug("SKIP - skip_list - {0}".format(fname))
                    break
            else:
                logger.debug("RUN - {0}".format(fname))
                t = test_name + '_' + fname
                for lang in langs:
                    if test_already_done(t, lang):
                        continue
                    emb = KeyedVectors.load(fname)
                    params["gensim_model"] = emb
                    if template == 'flair-gensim':
                        params["flair_model"] = ELMoEmbeddings('pt')
                    elif template == 'flair-gensim-local':
                        params["flair_model"] = ELMoEmbeddings(
                            options_file=
                            "../embeddings/allen_elmo/elmo_pt_options.json",
                            weight_file=
                            "../embeddings/allen_elmo/elmo_pt_weights.hdf5")
                    elif template == 'custom-flair-gensim-1':
                        params["flair_model"] = ELMoEmbeddings(
                            options_file="../embeddings/elmo/options.json",
                            weight_file=
                            "../embeddings/elmo/elmo_pt_weights.hdf5")
                    elif template == 'custom-flair-gensim-2':
                        params["flair_model"] = ELMoEmbeddings(
                            options_file="../embeddings/elmo/options_dgx1.json",
                            weight_file=
                            "../embeddings/elmo/elmo_pt_weights_dgx1.hdf5")
                    model = Embedding(**params)
                    measure = get_measure(model, lang, t)
                    yield measure

    elif template == "analogies":
        try:
            open(ANALOGIES_FILE, 'r').close()
        except:
            with open(ANALOGIES_FILE, 'w+') as f:
                json.dump({}, f)
        for path, path2, name, name2, dst, dst2 in get_analogies(
                ANALOGIES_DIR, EMBEDDINGS_DIR):
            key = name.rstrip('.txt') + '_' + path2.split(
                '/')[-1] + '_' + name2.rstrip('.model')
            with open(ANALOGIES_FILE, 'r') as f:
                stats = json.load(f)
            try:
                stats[key]
                continue
            except:
                pass
            embedding = KeyedVectors.load(dst2)
            score = embedding.evaluate_word_analogies(dst)[0]
            stats[key] = score
            with open(ANALOGIES_FILE, 'w+') as f:
                json.dump(stats, f)
            message = {"key": key, "stats": stats[key]}
            yield message
Example #23
def main():
    print("Instantiate embeddings")
    embeddings = DocumentPoolEmbeddings([
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward"),
        ELMoEmbeddings("pubmed"),
        BertEmbeddings("bert-large-uncased"),
    ])

    print("Load pubmed_data")
    pubmed_data = pd.concat([
        pd.read_json(f"data/medline/{medline_file}") for medline_file in
        ["medline_2016.json", "medline_2017.json", "medline_2018.json"]
    ])

    print("pubmed_corpus")
    pubmed_corpus = [
        try_sentence(text) for text in
        pubmed_data.title.apply(preproc).head(10_000)
    ]
    pubmed_corpus = [
        text for text in pubmed_corpus if text
    ]
    print(pubmed_corpus[0:5])

    print("query")
    query = [
        Sentence(text) for text in
        [
            "Searching for the causal effects of body mass index in over 300 000 participants in UK Biobank, using Mendelian randomization.",
            "Prioritizing putative influential genes in cardiovascular disease susceptibility by applying tissue-specific Mendelian randomization.",
            "Longitudinal analysis strategies for modelling epigenetic trajectories",
            "FATHMM-XF: accurate prediction of pathogenic point mutations via extended features",
            "PhenoSpD: an integrated toolkit for phenotypic correlation estimation and multiple testing correction using GWAS summary statistics.",
            "LD Hub: a centralized database and web interface to perform LD score regression that maximizes the potential of summary level GWAS data for SNP heritability and genetic correlation analysis.",
            "MELODI: Mining Enriched Literature Objects to Derive Intermediates",
            "The MR-Base platform supports systematic causal inference across the human phenome",
        ]
    ]

    print("Embed")
    for text in query + pubmed_corpus:
        embeddings.embed(text)

    print("Calculate scores")
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-5)
    cos_scores = []
    for query_id, query_text in enumerate(query):
        cos_res = [
            {
                "query_id": query_id,
                "target_id": target_id,
                "score": cos(query_text.embedding,
                             target_text.embedding).item()
            }
            for target_id, target_text in enumerate(pubmed_corpus)
        ]
        cos_scores.append(cos_res)

    cos_scores = pd.concat(pd.DataFrame(x) for x in cos_scores)
    print(cos_scores)

    n = 5
    for query_id, query_text in enumerate(query):
        print(f"# Query {query_id}")
        print(query_text)

        top_n = (
            cos_scores
              .query(f"query_id == {query_id}")
              .sort_values("score", ascending=False)
              .head(n)
        )
        for target_id, target_score in zip(top_n.target_id, top_n.score):
            print(f"  ## Candidate {target_id}, score {target_score}")
            print(f"  {pubmed_corpus[target_id]}\n")
        print("\n\n")
Example #24
        qdata['sentential_nodes_to_idx'] = sentential_nodes_to_idx
        qdata['doc_neighbors'] = doc_neighbors
        qdata['match_neighbors'] = match_neighbors
        qdata['question_idx'] = set([
            v for k, v in qdata['sentential_nodes_to_idx'].items()
            if k[0] == 'question'
        ])
        for ch in qdata['question']['choices']:
            qdata['choice_%s_idx' % ch['label']] = set([
                v for k, v in qdata['sentential_nodes_to_idx'].items()
                if k[0] == 'choice:%s' % ch['label']
            ])
        del qdata['edges']

    tokenizer = lambda x: nlp.word_tokenize(x)
    embedder = ELMoEmbeddings(embedding_model)
    neighbor_indices = list(
        set(n[0] for qdata in data for n in qdata['sentential_nodes_to_idx']
            if type(n[0]) == int))
    outputs = emb.get_corpus_embeddings(corpus, neighbor_indices, embedder,
                                        tokenizer)
    reverse_neighbor_idx_mapping = {
        v: i
        for i, v in enumerate(neighbor_indices)
    }
    for qdata in tqdm(data):
        qdata['node_embedding_matrix'] = emb.get_question_embedding_matrix(
            qdata, outputs, reverse_neighbor_idx_mapping, embedding_dim)

    store_final_data(output_file_name, qdata)
Example #25
from preprocessing.load_data import download_and_load_sts_data, download_and_load_sick_dataset
from preprocessing.normalize import normalize
from utility.frequency_loader import load_frequencies, load_doc_frequencies
from utility.run_experiment import run_experiment
import os

if not os.path.exists(IMAGE_PATH):
    os.makedirs(IMAGE_PATH)

sick_all, sick_train, sick_test, sick_dev = download_and_load_sick_dataset()
print('Downloaded data')

frequency = load_frequencies("data/frequencies/frequencies.tsv")
doc_frequency = load_doc_frequencies("data/frequencies/doc_frequencies.tsv")
word2vec = load_word2vec(w2v_path)
elmo = ELMoEmbeddings('large')
bert = TransformerWordEmbeddings('bert-large-cased')
flair = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
])
elmo_bert = StackedEmbeddings([elmo, bert])

print("Loaded Resources")

benchmarks = [("AVG-W2V",
               ft.partial(run_avg_benchmark,
                          model=word2vec,
                          use_stoplist=False)),
              ("AVG-ELMO",
Example #26
args = parser.parse_args()

test_df = pd.read_csv(args.inputfile)

# Throw away unwanted rows
test_df['AveragePosition'] = test_df['Average.Position']
test_df.drop('Average.Position', axis=1, inplace=True)
test_df.dropna(inplace=True)
test_df.CTR = test_df.CTR.apply(lambda x: x[:-1]).astype('float64')
test_df = test_df[(test_df.CPC != 0.0) | (test_df.AveragePosition != 0.0) |
                  (test_df.Impressions != 0.0)]

# Compute embeddings
print('Loading ELMo model...', end='')
elmo_small = ELMoEmbeddings('small')
print('Done!')
document_embedding = DocumentPoolEmbeddings([elmo_small])


def compute_elmo_embedding(keyword):
    sentence = Sentence(keyword)
    document_embedding.embed(sentence)
    return sentence.get_embedding().detach().cpu().numpy()


vectors = []
print('\nNow computing embeddings for keywords...', end='')
for keyword in tqdm(test_df.Keyword.values, total=test_df.shape[0]):
    vectors.append(compute_elmo_embedding(keyword))
vectors = pd.DataFrame.from_records(np.array(vectors), index=test_df.index)
Example #27
    def __init__(self, args, agent_mode):
        # initializes environment variables and then reads sentences.

        print('Initializing the Environment...')
        self.domain = args.domain
        self.dis_dim = args.dis_dim  # 50
        self.tag_dim = args.tag_dim  # 50
        self.word_dim = args.word_dim  # 50
        self.num_words = args.num_words  # 500
        self.action_rate = args.action_rate  # 0.1
        self.use_act_rate = args.use_act_rate  # 1
        # self.use_act_att = args.use_act_att  # 0
        self.reward_base = args.reward_base  # 50.0
        self.ra = args.reward_assign  # [1,2,3]
        self.word2vec = args.word2vec
        self.terminal_flag = False
        self.train_epoch_end_flag = False
        self.valid_epoch_end_flag = False
        self.max_data_char_len = 0
        self.max_data_sent_len = 0
        self.agent_mode = agent_mode  # args.agent_mode
        self.context_len = args.context_len  # 100

        if not args.gui_mode2:
            self.stacked_embeddings = args.stacked_embeddings
        elif args.gui_mode2:  # if gui mode, set different embeddings for different networks
            if agent_mode == 'act':
                self.word_dim = self.tag_dim = self.dis_dim = 3172
                self.stacked_embeddings = StackedEmbeddings([
                    WordEmbeddings('glove'),
                    BertEmbeddings('bert-base-uncased')
                ])
            elif agent_mode == 'arg':
                self.word_dim = self.tag_dim = self.dis_dim = 868
                self.stacked_embeddings = StackedEmbeddings(
                    [WordEmbeddings('glove'),
                     ELMoEmbeddings('small')])

        # read the sentences!!!
        if not args.gui_mode:
            if self.agent_mode == 'arg':

                indata = load_pkl('data/refined_%s_data.pkl' % self.domain)[-1]
                arg_sents = []

                for i in tqdm(range(len(indata))):
                    for j in range(len(indata[i])):
                        if len(indata[i][j]) == 0:
                            continue
                        # -1 obj_ind refer to UNK
                        # words = indata[i][j]['last_sent'] + indata[i][j]['this_sent'] + ['UNK'] # we don't need an unknown here.
                        words = indata[i][j]['last_sent'] + indata[i][j][
                            'this_sent']
                        current_sent = indata[i][j]['this_sent']
                        sent_len = len(
                            words)  #here sent len is last_sent + this_sent.
                        act_inds = [
                            a['act_idx'] for a in indata[i][j]['acts']
                            if a['act_idx'] < self.num_words
                        ]  #list of action indexes less than self.num_words = 128
                        for k in range(len(indata[i][j]['acts'])):
                            act_ind = indata[i][j]['acts'][k][
                                'act_idx']  # action index
                            obj_inds = indata[i][j]['acts'][k][
                                'obj_idxs']  # object index list
                            arg_sent = {}

                            # set arg tags
                            arg_tags = np.ones(sent_len,
                                               dtype=np.int32)  # tags

                            if len(obj_inds[1]) == 0:
                                arg_tags[obj_inds[0]] = 2  # essential objects
                            else:
                                arg_tags[obj_inds[0]] = 4  # exclusive objects
                                arg_tags[obj_inds[1]] = 4  # exclusive objects

                            # set distance
                            position = np.zeros(sent_len, dtype=np.int32)
                            position.fill(act_ind)
                            distance = np.abs(np.arange(sent_len) - position)

                            arg_sent['tokens'] = words
                            arg_sent['tags'] = arg_tags
                            arg_sent['act_ind'] = act_ind
                            arg_sent['distance'] = distance
                            arg_sent['act_inds'] = act_inds
                            arg_sent['obj_inds'] = obj_inds

                            # ipdb.set_trace()
                            sent_vec = []

                            if args.stacked_embeddings == 'word2vec':
                                for w in arg_sent['tokens']:
                                    if len(w) > self.max_data_char_len:
                                        self.max_data_char_len = len(w)
                                    if w in self.word2vec.vocab:
                                        sent_vec.append(self.word2vec[w])
                                    else:
                                        sent_vec.append(np.zeros(
                                            self.word_dim))
                            else:
                                # Stacked embeddings
                                line = ' '.join(words)
                                sent = Sentence(line)
                                args.stacked_embeddings.embed(sent)
                                for token in sent:
                                    sent_vec.append(token.embedding.numpy())

                                for w in arg_sent['tokens']:
                                    if len(w) > self.max_data_char_len:
                                        self.max_data_char_len = len(w)

                            sent_vec = np.array(sent_vec)
                            pad_len = self.num_words - len(sent_vec)

                            if len(sent_vec) > self.max_data_sent_len:
                                self.max_data_sent_len = len(sent_vec)

                            distance = np.zeros([self.num_words, self.dis_dim])
                            act_vec = sent_vec[arg_sent[
                                'act_ind']]  # word vector of the input action

                            # TODO: Attention is not required for contextual word embeddings, so commented it out to save time. Try it out if time permits.
                            # attention = np.sum(sent_vec * act_vec, axis=1)  # attention between the input action and its context
                            # attention = np.exp(attention)
                            # attention /= sum(attention)

                            if pad_len > 0:
                                # doc_vec = np.concatenate((doc_vec, np.zeros([pad_len, self.word_dim])))  # doc_vec.shape = [5oo, 5o]
                                # act_text['tags'] = np.concatenate((np.array(act_text['tags']), np.ones(pad_len, dtype=np.int32)))  # [500]
                                sent_vec = np.concatenate(
                                    (sent_vec,
                                     np.zeros([pad_len, self.word_dim])))  #
                                arg_sent['tags'] = np.concatenate(
                                    (np.array(arg_sent['tags']),
                                     np.ones(pad_len, dtype=np.int32)))
                                # attention = np.concatenate((attention, np.zeros(pad_len)))
                                for d in range(len(arg_sent['distance'])):
                                    distance[d] = arg_sent['distance'][d]
                            else:
                                sent_vec = sent_vec[:self.num_words]
                                arg_sent['tokens'] = arg_sent[
                                    'tokens'][:self.num_words]
                                arg_sent['tags'] = np.array(
                                    arg_sent['tags'])[:self.num_words]
                                # attention = attention[: self.num_words]
                                for d in range(self.num_words):
                                    distance[d] = arg_sent['distance'][d]

                            # TODO: Future work: Use attention
                            # if self.use_act_att:  # apply attention to word embedding
                            #     sent_vec = attention.reshape(-1, 1) * sent_vec

                            sent_vec = np.concatenate((sent_vec, distance),
                                                      axis=1)

                            arg_sent['sent_vec'] = sent_vec
                            arg_sent['tags'].shape = (self.num_words, 1)
                            # self.create_matrix(arg_sent,words) #create_matrix function
                            arg_sents.append(arg_sent)
                '''
                Split into train and test first,
                then split train into train and val.
                '''
                self.train_data, self.test_data = train_test_split(
                    arg_sents, test_size=0.2, random_state=1)
                self.train_data, self.validation_data = train_test_split(
                    self.train_data, test_size=0.2, random_state=1)

                self.train_steps = len(self.train_data) * self.num_words
                self.validation_steps = len(
                    self.validation_data) * self.num_words
                self.test_steps = len(self.test_data) * self.num_words

                self.num_train = len(self.train_data)
                self.num_validation = len(self.validation_data)
                self.num_test = len(self.test_data)

                print('\n\ntraining texts: %d\tvalidation texts: %d' %
                      (len(self.train_data), len(self.validation_data)))
                print('max_data_sent_len: %d\tmax_data_char_len: %d' %
                      (self.max_data_sent_len, self.max_data_char_len))
                print('self.train_steps: %d\tself.valid_steps: %d\n\n' %
                      (self.train_steps, self.validation_steps))

                print('\n\ntest texts: %d\t self.test_steps:%d\n' %
                      (len(self.test_data), self.test_steps))
            else:  #actions
                # self.read_act_texts()

                # read action texts into input_data

                input_data = load_pkl('data/%s_labeled_text_data.pkl' %
                                      self.domain)

                # unroll the stuff inside and store it in a list called act_texts
                act_texts = []
                for i in range(
                        len(input_data
                            )):  #until length of training examples (documents)
                    if len(
                            input_data[i]
                        ['words']) == 0:  #if there are no words in a document
                        continue
                    # act_text is a dictionary to store info.
                    act_text = {}
                    act_text['tokens'] = input_data[i][
                        'words']  #tokens = individual words
                    act_text['sents'] = input_data[i][
                        'sents']  #sents = sentences [['a ','cat ', 'runs.'], [ ], ...]
                    act_text['acts'] = input_data[i][
                        'acts']  #acts = [{},{},{}, ..] where {} = 4 tuple containing keys: [act_idx, obj_idxs, act_type, related_acts]
                    act_text['sent_acts'] = input_data[i][
                        'sent_acts']  #list of acts in a sentence for every sentence.
                    act_text['word2sent'] = input_data[i][
                        'word2sent']  # {0:0, 1:0, 2:0, .... 38:2....} Mapping of word_index to sentence_index
                    act_text['tags'] = np.ones(
                        len(input_data[i]['words']), dtype=np.int32
                    )  #same length as number of words in a document.
                    act_text['act2related'] = {}  #related actions

                    #for all action 4 tuples
                    for acts in input_data[i]['acts']:
                        act_text['act2related'][acts['act_idx']] = acts[
                            'related_acts']  # act_text['act2related'] = {act_idx: []} where [] is list of related actions
                        act_text['tags'][acts['act_idx']] = acts[
                            'act_type'] + 1  # TODO: 2, 3, 4? - why? act_text['tags'] = [2,3,4,2,2,3,3,4,4,...] where index of array is action_index

                    # self.create_matrix(act_text)
                    # Creating matrix
                    doc_vec = []

                    if args.stacked_embeddings != 'word2vec':
                        # doing Flair embeddings
                        for sent in tqdm(act_text['sents']):
                            line = ' '.join(sent)
                            sentence = Sentence(line)
                            args.stacked_embeddings.embed(sentence)
                            for token in sentence:
                                # print(token.embedding.shape)  # 4196

                                doc_vec.append(token.embedding.numpy())

                        #initialize word2vec or zeroes
                        for word in act_text['tokens']:
                            if len(word) > self.max_data_char_len:
                                self.max_data_char_len = len(
                                    word
                                )  #max_data_char_len shows longest word.
                            # if word in self.word2vec.vocab:
                            #     doc_vec.append(self.word2vec[word])
                            # else:
                            #     doc_vec.append(np.zeros(self.word_dim))

                    elif args.stacked_embeddings == 'word2vec':
                        # initialize word2vec or zeroes
                        for word in act_text['tokens']:
                            if len(word) > self.max_data_char_len:
                                self.max_data_char_len = len(
                                    word
                                )  # max_data_char_len shows longest word.
                            if word in self.word2vec.vocab:
                                doc_vec.append(self.word2vec[word])
                            else:
                                doc_vec.append(np.zeros(self.word_dim))

                    doc_vec = np.array(doc_vec)
                    pad_len = self.num_words - len(doc_vec)
                    if len(doc_vec) > self.max_data_sent_len:
                        self.max_data_sent_len = len(
                            doc_vec
                        )  #max_data_sent_len is length of longest document vector..

                    # print(doc_vec.shape)

                    if pad_len > 0:  #if not negative.
                        doc_vec = np.concatenate(
                            (doc_vec,
                             np.zeros([pad_len, self.word_dim
                                       ])))  # doc_vec.shape = [5oo, 5o]
                        act_text['tags'] = np.concatenate(
                            (np.array(act_text['tags']),
                             np.ones(pad_len, dtype=np.int32)))  # [500]
                    else:  #pad_len is negative
                        doc_vec = doc_vec[:self.num_words]  #pick first 500
                        act_text['tokens'] = act_text[
                            'tokens'][:self.
                                      num_words]  #also in tokens, first 500
                        act_text['tags'] = np.array(
                            act_text['tags']
                        )[:self.num_words]  #also in tags, first 500

                    act_text[
                        'sent_vec'] = doc_vec  # set sentence vec to 500,50 doc_vec
                    act_text['tags'].shape = (self.num_words, 1
                                              )  # redefine shape to 500,1

                    act_texts.append(
                        act_text)  #keep collecting documents in act_texts
                '''
                Split into train and test first,
                then split train into train and val.
                '''
                # seed makes sure dataset is always split in the same way randomly
                self.train_data, self.test_data = train_test_split(
                    act_texts, test_size=0.2, random_state=1)
                self.train_data, self.validation_data = train_test_split(
                    self.train_data, test_size=0.2, random_state=1)

                self.train_steps = len(
                    self.train_data
                ) * self.num_words  # length of train data * 500
                self.validation_steps = len(
                    self.validation_data
                ) * self.num_words  #length of validation data * 500 -- Why a step includes multiplication with num_words?  because each training and val example contains 500 words.
                self.test_steps = len(self.test_data) * self.num_words

                self.num_train = len(self.train_data)
                self.num_validation = len(self.validation_data)
                self.num_test = len(self.test_data)

                print('\n\ntraining texts: %d\tvalidation texts: %d' %
                      (len(self.train_data), len(self.validation_data)))
                print('max_data_sent_len: %d\tmax_data_char_len: %d' %
                      (self.max_data_sent_len,
                       self.max_data_char_len))  #sent len means doc len
                print('self.train_steps: %d\tself.valid_steps: %d\n\n' %
                      (self.train_steps, self.validation_steps))

                print('\n\ntest texts: %d\t self.test_steps:%d\n' %
                      (len(self.test_data), self.test_steps))

            args.train_steps = self.train_steps
            args.valid_steps = self.validation_steps  # validation steps
            args.test_steps = self.test_steps
Example #28
def load_elmo_embeddings(ename):
    return DocumentPoolEmbeddings([ELMoEmbeddings(ename)])
Example #29
corpus: Corpus = ColumnCorpus(data_folder,
                              columns,
                              train_file="train.txt",
                              dev_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[
                     StackedEmbeddings([
                         ELMoEmbeddings('original'),
                         FlairEmbeddings('news-forward'),
                         FlairEmbeddings('news-backward'),
                         BertEmbeddings('bert-large-cased')
                     ]),
                     StackedEmbeddings([
                         ELMoEmbeddings('original'),
                         FlairEmbeddings('news-forward'),
                         FlairEmbeddings('news-backward'),
                         BertEmbeddings('bert-large-cased'),
                         CharacterEmbeddings()
                     ])
                 ])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
Example #30
class Database(object):
    def __init__(self, docs):
        #self.documents_orig = np.loadtxt(docs, delimiter='\n', dtype = str)   # only 9999 documents
        self.documents_orig = []
        with open(docs, 'r') as f:  # getting 10k docs using this
            self.documents_orig = f.readlines()

        self.documents = []
        self.elmo = ELMoEmbeddings()
        #self.embedding = DocumentPoolEmbeddings([self.elmo])
        self.debug = True

    def knn(self, query, query_txt, k):
        #cos_sim = torch.mm(self.documents, query) / (torch.norm(query) * torch.norm(self.documents))

        cos_sim = torch.nn.functional.cosine_similarity(self.documents, query)

        topk, topk_indices = torch.topk(cos_sim, k, 0, True)

        topk_indices = topk_indices.numpy().astype('int')
        topk = topk.numpy().astype('float')
        top_combined = np.vstack((topk, topk_indices)).T

        if self.debug:
            print("\n")
            print("Query: ", query_txt, " | index: ", topk_indices.T)
            [
                print(self.documents_orig[int(i[1])], " --- ", i[0])
                for i in top_combined
            ]

        return list(zip(topk, topk_indices))  #used to return tuples

    def load_documents_into_embedding(self):
        print("Embedding ", len(self.documents_orig), " Documents")
        #self.documents_orig = self.documents_orig[0:50]
        self.documents = [
            self.elmo.embed(Sentence(elem)) for elem in self.documents_orig
        ]

        self.documents = torch.stack([
            torch.cat([token.embedding.unsqueeze(0) for token in elem[0]],
                      dim=0)[0] for elem in self.documents
        ])

        np.save("./documents_embedded.npy", self.documents)

    def run_query(self, query, k=None):
        """Run a query on the given documents based on word embeddings
        
        Arguments:
            query {str} -- Query string.
        
        Keyword Arguments:
            k {int} -- The top documents to return (default: 10)
        
        Returns:
            list[tuple[float, int]] -- Sorted list of tuples, which contain the score and the document id.
                Made up example to show the formatting with k=5:
                        [(0.89316645860672, 1567), 
                        (0.6174346804618835, 125), 
                        (0.5975501537321234, 1181), 
                        (0.5779426293373108, 3979), 
                        (0.5110726475715637, 7155)]
        """
        if k is None:
            k = 10

        sentence = Sentence(query)

        #self.embedding.embed(sentence)

        self.elmo.embed(sentence)

        sentence = [token.embedding.unsqueeze(0) for token in sentence][0]

        #print(sentence)

        # A returned list should look like this for k=5. Btw. the numbers are made up!

        #[
        #            (0.89316645860672, 1567),
        #            (0.6174346804618835, 125),
        #            (0.5975501537321234, 1181),
        #            (0.5779426293373108, 3979),
        #            (0.5110726475715637, 7155),
        #        ]

        return self.knn(sentence, query, k=k)

    def run_query_txt(self, text):
        self.queries = np.loadtxt(text, delimiter='\n', dtype=str)

        results = []

        for query in self.queries:
            out = self.run_query(query)
            results.append(out)

        #saving results

        file = open("results.txt", 'w')

        for elem in results:
            out = ""
            for res in elem:
                out += str(res[0]) + "," + str(res[1]) + ";"
            out = out[:-1]
            out += '\n'
            file.write(out)

        file.close()
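A minimal driver for the class above (file names are assumptions):

if __name__ == '__main__':
    db = Database('documents.txt')
    db.load_documents_into_embedding()
    db.run_query_txt('queries.txt')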