import numpy
from spacy.tokens import Span
from sense2vec import Sense2VecComponent


def test_component_similarity(doc):
    s2v = Sense2VecComponent(doc.vocab, shape=(4, 4))
    s2v.first_run = False
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.s2v.add("hello|INTJ", vector)
    s2v.s2v.add("world|NOUN", vector)
    doc = s2v(doc)
    assert doc[0]._.s2v_similarity(doc[1]) == 1.0
    assert doc[1:3]._.s2v_similarity(doc[1:3]) == 1.0
def test_component_to_from_bytes(doc):
    s2v = Sense2VecComponent(doc.vocab, shape=(1, 4))
    s2v.first_run = False
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.s2v.add("world|NOUN", vector)
    assert "world|NOUN" in s2v.s2v
    assert "world|GPE" not in s2v.s2v
    doc = s2v(doc)
    assert doc[0]._.in_s2v is False
    assert doc[1]._.in_s2v is True
    s2v_bytes = s2v.to_bytes()
    new_s2v = Sense2VecComponent(doc.vocab).from_bytes(s2v_bytes)
    new_s2v.first_run = False
    assert "world|NOUN" in new_s2v.s2v
    assert numpy.array_equal(new_s2v.s2v["world|NOUN"], vector)
    assert "world|GPE" not in new_s2v.s2v
    new_s2v.s2v.vectors.resize((2, 4))
    new_s2v.s2v.add("hello|INTJ", vector)
    assert doc[0]._.in_s2v is False
    new_doc = new_s2v(doc)
    assert new_doc[0]._.in_s2v is True
def test_component_lemmatize(doc):
    lookups = doc.vocab.lookups.add_table("lemma_lookup")
    lookups["world"] = "wrld"
    s2v = Sense2VecComponent(doc.vocab, shape=(4, 4), lemmatize=True)
    s2v.first_run = False
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.s2v.add("hello|INTJ", vector)
    s2v.s2v.add("world|NOUN", vector)
    s2v.s2v.add("wrld|NOUN", vector)
    doc = s2v(doc)
    assert doc[0]._.s2v_key == "hello|INTJ"
    assert doc[1].lemma_ == "wrld"
    assert doc[1]._.s2v_key == "wrld|NOUN"
    lookups["hello"] = "hll"
    assert doc[0].lemma_ == "hll"
    assert doc[0]._.s2v_key == "hello|INTJ"
    s2v.s2v.add("hll|INTJ", vector)
    assert doc[0]._.s2v_key == "hll|INTJ"
    new_s2v = Sense2VecComponent().from_bytes(s2v.to_bytes())
    assert new_s2v.s2v.cfg["lemmatize"] is True
    doc.vocab.lookups.remove_table("lemma_lookup")
def test_component_attributes(doc):
    s2v = Sense2VecComponent(doc.vocab, shape=(10, 4))
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.s2v.add("world|NOUN", vector, 123)
    doc = s2v(doc)
    assert doc[0]._.s2v_key == "hello|INTJ"
    assert doc[1]._.s2v_key == "world|NOUN"
    assert doc[0]._.in_s2v is False
    assert doc[1]._.in_s2v is True
    assert doc[0]._.s2v_freq is None
    assert doc[1]._.s2v_freq == 123
    assert numpy.array_equal(doc[1]._.s2v_vec, vector)
def test_component_attributes_ents(doc):
    s2v = Sense2VecComponent(doc.vocab, shape=(10, 4))
    s2v.first_run = False
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.s2v.add("world|NOUN", vector)
    s2v.s2v.add("world|GPE", vector)
    doc = s2v(doc)
    assert len(doc._.s2v_phrases) == 0
    doc.ents = [Span(doc, 1, 2, label="GPE")]
    assert len(doc._.s2v_phrases) == 1
    phrase = doc._.s2v_phrases[0]
    assert phrase._.s2v_key == "world|GPE"
    assert phrase[0]._.s2v_key == "world|NOUN"
    assert phrase._.in_s2v is True
    assert phrase[0]._.in_s2v is True
def test_component_lemmatize(doc):
    def lemmatize(doc, lookups):
        for token in doc:
            token.lemma_ = lookups.get(token.text, token.text)
        return doc

    s2v = Sense2VecComponent(doc.vocab, shape=(4, 4), lemmatize=True)
    s2v.first_run = False
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.s2v.add("hello|INTJ", vector)
    s2v.s2v.add("world|NOUN", vector)
    s2v.s2v.add("wrld|NOUN", vector)
    doc = lemmatize(doc, {"world": "wrld"})
    doc = s2v(doc)
    assert doc[0]._.s2v_key == "hello|INTJ"
    assert doc[1].lemma_ == "wrld"
    assert doc[1]._.s2v_key == "wrld|NOUN"
    doc = lemmatize(doc, {"hello": "hll"})
    assert doc[0].lemma_ == "hll"
    assert doc[0]._.s2v_key == "hello|INTJ"
    s2v.s2v.add("hll|INTJ", vector)
    assert doc[0]._.s2v_key == "hll|INTJ"
    new_s2v = Sense2VecComponent().from_bytes(s2v.to_bytes())
    assert new_s2v.s2v.cfg["lemmatize"] is True
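# Minimal usage sketch, not part of the tests above: the tests build the
# component with an in-memory table, but in ordinary use the same extensions
# are read after loading pretrained vectors. The model name and vectors path
# below are placeholders, and spaCy v2.x is assumed (component instances can
# be passed directly to nlp.add_pipe).
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_sm")
s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/s2v_reddit_2019_lg")
nlp.add_pipe(s2v)
doc = nlp("natural language processing")
for token in doc:
    if token._.in_s2v:
        print(token._.s2v_key, token._.s2v_freq)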
def __init__(self):
    self.nlp = spacy.load("en_core_web_lg")
    s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')
    self.nlp.add_pipe(s2v)
    self.elements = {}
    with open('ai_chatbot/scripts/QuestionDomain/csv/TAG_ELEMENT.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            self.elements[row[0]] = row[1:-1]
    self.ordinal = {}
    with open('ai_chatbot/scripts/QuestionDomain/csv/TAG_ORDINAL.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            self.ordinal[row[0]] = int(row[1])
import spacy
import numpy as np
import sense2vec
from sense2vec import Sense2VecComponent

nlp = spacy.load('en_core_web_lg')
# Note: the component is created without vectors here; in practice a sense2vec
# vectors package has to be supplied (e.g. loaded with .from_disk) before the
# extensions below can return results.
s2v = Sense2VecComponent()
nlp.add_pipe(s2v)

doc = nlp("A sentence about natural language processing.")
most_similar = doc[3:6]._.s2v_most_similar(3)
print(most_similar)
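# For reference: in sense2vec 1.x, s2v_most_similar returns a list of
# ((phrase, sense), score) tuples, so the result above can be unpacked as
# follows (the printed values are illustrative only, not actual output):
for (phrase, sense), score in most_similar:
    print(f"{phrase} ({sense}): {score:.3f}")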
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_lg")
s2v = Sense2VecComponent(nlp.vocab).from_disk("C:/fyp/s2v_reddit_2019_lg")
nlp.add_pipe(s2v)

with open("api/v0/v01/input_list/input_list.txt", "r", encoding="utf-8") as f:
    TEXT = f.read()
doc = nlp(TEXT)


def list_of_files(doc):
    # Find all first letters in the doc and build a list of unique text file
    # names to use.
    text_files = list()
    for token in doc:
        if token.text.isalpha():
            text_files.append("text" + (str(token.text[0])).lower() + ".txt")
    return set(text_files)


def find_all_begining_with(doc):
    # Take the entire vocab and find all words in it that begin with the letter
    # each of the text files (created in list_of_files) is named for, then
    # create each text file and populate it with its own vocab.
    with open("api/v0/v01/vocab/vocab.txt", "r", encoding="utf-8") as f:
        vocab_list = list(f.readlines())
    # vocab_list = ["ant", "apple", "arch", "arm", "army", "baby", "bag", "ball", "band", "basin", "basket", "bath", "bed", "bee", "bell", "berry", "bird", "blade", "board", "boat", "bone", "book", "boot", "bottle", "box", "boy", "brain", "brake", "branch", "brick", "bridge", "brush", "bucket", "bulb", "button", "cake", "camera", "card", "cart", "carriage", "cat", "chain", "cheese", "chest", "chin", "church", "circle", "clock", "cloud", "coat", "collar", "comb", "cord", "cow", "cup", "curtain", "cushion", "dog", "door", "drain", "drawer", "dress", "drop", "ear", "egg", "engine", "eye", "face", "farm", "feather", "finger", "fish", "flag", "floor", "fly", "foot", "fork", "fowl", "frame", "garden", "girl", "glove", "goat", "gun", "hair", "hammer", "hand", "hat", "head", "heart", "hook", "horn", "horse", "hospital", "house", "island", "jewel", "kettle", "key", "knee", "knife", "knot", "leaf", "leg", "library", "line", "lip", "lock", "map", "match", "monkey", "moon", "mouth", "muscle", "nail", "neck", "needle", "nerve", "net", "nose", "nut", "office", "orange", "oven", "parcel", "pen", "pencil", "picture", "pig", "pin", "pipe", "plane", "plate", "plough", "pocket", "pot", "potato", "prison", "pump", "rail", "rat", "receipt", "ring", "rod", "roof", "root", "sail", "school", "scissors", "screw", "seed", "sheep", "shelf", "ship", "shoe", "skin", "snake", "sock", "spade", "sponge", "spoon", "spring", "square", "stamp", "star", "station", "stem", "stick", "stocking", "stomach", "store", "street", "sun", "table", "tail", "thread", "throat", "thumb", "ticket", "toe", "tongue", "tooth", "town", "train", "tray", "tree", "trousers", "umbrella", "wall", "watch", "wheel", "whip", "whistle", "window", "wing", "wire", "worm"]
    text_files = list_of_files(doc)
    for file in text_files:
        with open(("api/v0/v01/textfiles/" + file), "w+",
for line in f:
    values = line.split()
    token = values[0]
    vector = np.asarray(values[1:], "float32")
    embeddings_dict[token] = vector

# Loading Word2Vec Embeddings
model = gensim.models.KeyedVectors.load_word2vec_format(
    '/content/drive/My Drive/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    '/content/drive/My Drive/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)

# Loading Sense Embeddings
nlp = en_core_web_lg.load()
s2v = Sense2VecComponent(nlp.vocab).from_disk("/content/drive/My Drive/s2v_reddit_2019_lg")
nlp.add_pipe(s2v)

# Load training data
df = pd.read_csv('train_tsv.tsv', sep='\t',
                 names=["is_duplicate", "question1", "question2", "id"])
df = df.set_index('id')
df = df[(df['question1'].isna() == False) & (df['question2'].isna() == False)]
training_data = list(df['question1']) + list(df['question2'])
training_data = [preprocessing_pipeline(i) for i in training_data]
tfidf_vectorizer = tfidf(training_data)
vectorizer_bow = bag_of_words(training_data, 0)
vectorizer_ngram = bag_of_words(training_data, 1)
vectorizer_3gram = trigram(training_data)
noun_chunks_df.loc[i, 'root'] = chunk.root
noun_chunks_df.loc[i, 'root.text'] = chunk.root.text
noun_chunks_df.loc[i, 'root.dep_'] = chunk.root.dep_
noun_chunks_df.loc[i, 'root.head.text'] = chunk.root.head.text

print(noun_chunks_df[:20])

nlp = spacy.load('en_core_web_sm', disable=["tagger", "ner"])
train_df['parsed'] = train_df.Text[49500:50500].apply(nlp)
corpus = st.CorpusFromParsedDocuments(train_df[49500:50500], category_col='Score',
                                      parsed_col='parsed').build()

from sense2vec.vectors import VectorMap

s2v = Sense2VecComponent('data/reddit_vectors-1.1.0/reddit_vectors-1.1.0')
spacy_tok.add_pipe(s2v)

doc = spacy_tok(u"dessert.")
freq = doc[0]._.s2v_freq
vector = doc[0]._.s2v_vec
most_similar = doc[0]._.s2v_most_similar(5)
print(most_similar, freq)

doc = spacy_tok(u"burger")
most_similar = doc[0]._.s2v_most_similar(4)
print(most_similar)

train_df['tuples'] = train_df.apply(lambda row: (row['Text'], row['Score']), axis=1)
train = train_df['tuples'].tolist()
print(train[:1])
def tokenize(self):
    if self.texts is None:
        if not self.context:
            # Read in text data from text_file path
            self.texts = open(self.text_file).read().split('\n')
            self.texts = [str(t) for t in self.texts]
            print('Made texts')
        else:
            filename, file_ext = os.path.splitext(self.text_file)
            if file_ext == '.json':
                # Read in json data as dataframe
                # noinspection PyUnresolvedReferences
                df = pd.read_json(self.text_file, lines=True)
            else:
                # Read in tabular data as dataframe
                # noinspection PyUnresolvedReferences
                df = pd.read_csv(self.text_file, sep=self.sep, usecols=self.use_cols)
            # Extract the text
            text_col_name = self.use_cols[0]
            self.texts = df[text_col_name].values.astype(str).tolist()
            # Small memory reduction by deleting this
            del df[text_col_name]
            self.context_df = df

    # Get number of documents supplied
    self.n_docs = len(self.texts)
    # Init data as a bunch of zeros - shape [n_docs, max_len]
    self.data = np.zeros((self.n_docs, self.max_len), dtype=np.uint64)
    if not self.tokenizing_new:
        # Add the skip token to the vocab, creating a unique hash for it
        self.nlp.vocab.strings.add(self.skip_token)
        self.skip_token = self.nlp.vocab.strings[self.skip_token]
    self.data[:] = self.skip_token
    # Make array to store row numbers of documents that must be deleted
    self.purged_docs = []
    # This array will hold tokenized text data if it is asked for
    if self.save_tokenized_text_data:
        self.text_data = []
    if self.tokenize_sents:
        self.sentence_tokenize()
        return

    # If we want to merge phrases, we add the s2v component
    # to our pipe and it will do it for us.
    if self.merge:
        s2v = Sense2VecComponent('reddit_vectors-1.1.0')
        self.nlp.add_pipe(s2v)

    for i, doc in enumerate(
            self.nlp.pipe(self.texts, n_threads=self.n_threads, batch_size=10000)):
        # noinspection PyBroadException
        try:
            # Create temp list for holding doc text
            if self.save_tokenized_text_data:
                doc_text = []
            for token in doc:
                # TODO - determine if you want to leave spaces or replace with underscores
                # Replaces spaces between phrases with underscore
                # text = token.text.replace(" ", "_")
                # Get the string token for the given token type
                if self.token_type == 'lower':
                    _token = token.lower_
                elif self.token_type == 'lemma':
                    _token = token.lemma_
                else:
                    _token = token.orth_
                # Add token to spacy string list so we can use oov as known hash tokens
                if token.is_oov:
                    self.nlp.vocab.strings.add(_token)
                if self.save_tokenized_text_data:
                    doc_text.append(_token)
            if self.save_tokenized_text_data:
                self.text_data.append(doc_text)

            # Options for how to tokenize
            if self.token_type == 'lower':
                dat = doc.to_array([LOWER, LIKE_EMAIL, LIKE_URL, IS_OOV, IS_PUNCT, IS_ALPHA])
            elif self.token_type == 'lemma':
                dat = doc.to_array([LEMMA, LIKE_EMAIL, LIKE_URL, IS_OOV, IS_PUNCT, IS_ALPHA])
            else:
                dat = doc.to_array([ORTH, LIKE_EMAIL, LIKE_URL, IS_OOV, IS_PUNCT, IS_ALPHA])

            if len(dat) > 0:
                assert dat.min() >= 0, 'Negative indices reserved for special tokens'
                if self.skip_oov:
                    # Get indices of email, URL and oov tokens
                    idx = (dat[:, 1] > 0) | (dat[:, 2] > 0) | (dat[:, 3] > 0)
                else:
                    # Get indices of email and URL tokens
                    idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
                # Replace email and URL tokens with skip token
                dat[idx] = self.skip_token
                # Delete punctuation
                if self.delete_punc:
                    delete = np.where(dat[:, 4] == 1)
                    dat = np.delete(dat, delete, 0)
                if self.only_keep_alpha:
                    delete = np.where(dat[:, 5] == 0)
                    dat = np.delete(dat, delete, 0)
                length = min(len(dat), self.max_len)
                self.data[i, :length] = dat[:length, 0].ravel()
        except Exception:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print('\n\n')
            print(exc_type, filename, exc_tb.tb_lineno)
            self.purged_docs.append(i)
            continue

    # If necessary, delete documents that failed to tokenize correctly.
    self.data = np.delete(self.data, self.purged_docs, 0).astype(np.uint64)
    # Unique tokens
    self.uniques = np.unique(self.data)
    # Saved Spacy Vocab
    self.vocab = self.nlp.vocab
    # Making an idx to word mapping for vocab
    self.hash_to_word = {}
    # Insert padding id into the hash
    self.hash_to_word[self.skip_token] = '<SKIP>'
    # If lemma, insert pronoun ID into the hash
    if self.token_type == 'lemma':
        self.hash_to_word[self.nlp.vocab.strings['-PRON-']] = '-PRON-'
    for v in self.uniques:
        if v != self.skip_token:
            # noinspection PyPep8,PyBroadException
            try:
                if self.token_type == 'lower':
                    self.hash_to_word[v] = self.nlp.vocab[v].lower_
                elif self.token_type == 'lemma':
                    self.hash_to_word[v] = self.nlp.vocab[v].lemma_
                else:
                    self.hash_to_word[v] = self.nlp.vocab[v].orth_
            except Exception:
                pass
def tokenize(self):
    # This is here in case we want to tokenize more documents later
    if self.texts is None:
        if not self.context:
            # Read in text data from textfile path
            self.texts = open(self.textfile).read().split('\n')
            self.texts = [str(t) for t in self.texts]
            print("made texts")
        else:
            filename, file_extension = os.path.splitext(self.textfile)
            if file_extension == ".json":
                # Read in json data as dataframe
                df = pd.read_json(self.textfile, lines=True)
            else:
                # Read in data as dataframe
                df = pd.read_csv(self.textfile, sep=self.sep, usecols=self.usecols)
            # Extract the text
            text_col_name = self.usecols[0]
            self.texts = df[text_col_name].values.astype(str).tolist()
            # Small memory reduction by deleting this
            del df[text_col_name]
            self.context_df = df

    # Get number of documents supplied
    self.num_docs = len(self.texts)
    # Init data as a bunch of zeros - shape [num_texts, max_length]
    self.data = np.zeros((len(self.texts), self.max_length), dtype=np.uint64)
    if not self.tokenizing_new:
        # Add the skip token to the vocab, creating a unique hash for it
        self.nlp.vocab.strings.add(self.skip)
        self.skip = self.nlp.vocab.strings[self.skip]
    self.data[:] = self.skip
    # Make array to store row numbers of documents that must be deleted
    self.purged_docs = []
    # This array will hold tokenized text data if it is asked for
    if self.save_tokenized_text_data:
        self.text_data = []
    if self.tokenize_sentences:
        self.sentence_tokenize()
        return

    # If we want to merge phrases, we add the s2v component
    # to our pipe and it will do it for us.
    if self.merge:
        s2v = Sense2VecComponent('reddit_vectors-1.1.0')
        self.nlp.add_pipe(s2v)

    for row, doc in enumerate(
            self.nlp.pipe(self.texts, n_threads=self.num_threads, batch_size=10000)):
        try:
            # Create temp list for holding doc text
            if self.save_tokenized_text_data:
                doc_text = []
            # Loop through tokens in doc
            for token in doc:
                # TODO - determine if you want to leave spaces or replace with underscores
                # Replaces spaces between phrases with underscore
                # text = token.text.replace(" ", "_")
                # Get the string token for the given token type
                if self.token_type == "lower":
                    _token = token.lower_
                elif self.token_type == "lemma":
                    _token = token.lemma_
                else:
                    _token = token.orth_
                # Add token to spacy string list so we can use oov as known hash tokens
                if token.is_oov:
                    self.nlp.vocab.strings.add(_token)
                if self.save_tokenized_text_data:
                    doc_text.append(_token)
            if self.save_tokenized_text_data:
                self.text_data.append(doc_text)

            # Options for how to tokenize
            if self.token_type == "lower":
                dat = doc.to_array([LOWER, LIKE_EMAIL, LIKE_URL, IS_OOV, IS_PUNCT, IS_ALPHA])
            elif self.token_type == "lemma":
                dat = doc.to_array([LEMMA, LIKE_EMAIL, LIKE_URL, IS_OOV, IS_PUNCT, IS_ALPHA])
            else:
                dat = doc.to_array([ORTH, LIKE_EMAIL, LIKE_URL, IS_OOV, IS_PUNCT, IS_ALPHA])

            if len(dat) > 0:
                msg = "Negative indices reserved for special tokens"
                assert dat.min() >= 0, msg
                if self.skip_oov:
                    # Get indexes of email, URL and oov tokens
                    idx = (dat[:, 1] > 0) | (dat[:, 2] > 0) | (dat[:, 3] > 0)
                else:
                    # Get indexes of email and URL tokens
                    idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
                # Replace email and URL tokens with skip token
                dat[idx] = self.skip
                # Delete punctuation
                if self.delete_punctuation:
                    delete = np.where(dat[:, 4] == 1)
                    dat = np.delete(dat, delete, 0)
                if self.only_keep_alpha:
                    delete = np.where(dat[:, 5] == 0)
                    dat = np.delete(dat, delete, 0)
                length = min(len(dat), self.max_length)
                self.data[row, :length] = dat[:length, 0].ravel()
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print("\n\n")
            print(exc_type, fname, exc_tb.tb_lineno)
            # print("Warning! Document", row, "broke, likely due to spaCy merge issues.\nMore info at their github, issues #1547 and #1474")
            self.purged_docs.append(row)
            continue

    # If necessary, delete documents that failed to tokenize correctly.
    self.data = np.delete(self.data, self.purged_docs, 0).astype(np.uint64)
    # Unique tokens
    self.uniques = np.unique(self.data)
    # Saved Spacy Vocab
    self.vocab = self.nlp.vocab
    # Making an idx to word mapping for vocab
    self.hash_to_word = {}
    # Manually putting in this hash for the padding ID
    self.hash_to_word[self.skip] = '<SKIP>'
    # If lemma, manually put in hash for the pronoun ID
    if self.token_type == "lemma":
        self.hash_to_word[self.nlp.vocab.strings["-PRON-"]] = "-PRON-"
    for v in self.uniques:
        if v != self.skip:
            try:
                if self.token_type == "lower":
                    self.hash_to_word[v] = self.nlp.vocab[v].lower_
                elif self.token_type == "lemma":
                    self.hash_to_word[v] = self.nlp.vocab[v].lemma_
                else:
                    self.hash_to_word[v] = self.nlp.vocab[v].orth_
            except Exception:
                pass
import nltk, spacy
from nltk.tokenize import sent_tokenize
from sense2vec import Sense2VecComponent

nltk.download('punkt')

# Resources for determining similarity: spaCy, sense2vec
s2v_path = "D:\\Programs\\Python37x64\\nlp_config\\s2v_reddit_2015_md"
spacy_lg_path = 'D:\\Programs\\Python37x64\\nlp_config\\venv\\Lib\\site-packages\\en_core_web_lg\\en_core_web_lg-2.2.5'
nlp = spacy.load(spacy_lg_path)
s2v = Sense2VecComponent(nlp.vocab).from_disk(s2v_path)
nlp.add_pipe(s2v)

seeds = {}
seeds['food'] = "food drink"
seeds['atms'] = "atmosphere place environment"
seeds['serv'] = "server management time"
seeds['prce'] = "money expensive"


def avg(my_list):
    if len(my_list) == 0:
        return 0
    return sum(my_list) / len(my_list)


def get_noun_toks(doc):
    return [tok for tok in doc if tok.tag_.startswith('N')]


def calculate_similarity(noun, seed):
    if not seed.has_vector or not noun.has_vector:
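# A minimal sketch, not the original author's code, of how the truncated
# calculate_similarity helper above might continue: return 0 when either token
# lacks a vector, otherwise average spaCy's similarity with the sense2vec
# similarity when both tokens are known to s2v. The function name is assumed
# for illustration only.
def calculate_similarity_sketch(noun, seed):
    if not seed.has_vector or not noun.has_vector:
        return 0
    scores = [noun.similarity(seed)]
    if noun._.in_s2v and seed._.in_s2v:
        scores.append(noun._.s2v_similarity(seed))
    return avg(scores)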
predargs = True

# use standard python randomizer
import random
# glob needed to find files
import glob
# nltk
import nltk
# sense2vec in combination with spaCy
import spacy
from sense2vec import Sense2VecComponent

spacynlp = spacy.load('en')
s2v = Sense2VecComponent('C:/Python27/ReqAnalyzing/reddit_vectors-1.1.0')
spacynlp.add_pipe(s2v)

# make deepcopy available
import copy
# grammar check
import grammar_check

tool = grammar_check.LanguageTool('en-GB')

'''
Construct requirements starts here

Proposed order.
1. Pick verb from allparts list
2. Check which frames/arguments are needed with verbnet
3. Find a semantically matching argument pair in allparts files
def __init__(self, model_name, sense2vec_path):
    self.nlp = spacy.load(model_name)
    s2v = Sense2VecComponent(self.nlp.vocab).from_disk(sense2vec_path)
    self.nlp.add_pipe(s2v)
import os
from typing import List

import spacy
from dataclasses import dataclass
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, root_validator
from sense2vec import Sense2VecComponent
from spacy.language import Language
from starlette.responses import Response
from starlette.status import HTTP_204_NO_CONTENT

app: FastAPI = FastAPI()
model: str = os.getenv('SPACY_MODEL')
pipeline_error: str = f"The model ({model}) doesn't support " + '{}.'
nlp: Language = spacy.load(model)
if os.getenv('SENSE2VEC') == '1':
    nlp.add_pipe(
        Sense2VecComponent(nlp.vocab).from_disk('src/s2v_old')
    )


def enforce_components(components: List[str], message: str) -> None:
    """Throws the <message> if the model doesn't have the <components>."""
    for component in components:
        if not nlp.has_pipe(component):
            raise HTTPException(
                status_code=400,
                detail=pipeline_error.format(message)
            )


class NERRequest(BaseModel):
    sections: List[str]
from pipelines import pipeline
from text2text.text_generator import TextGenerator
import nltk
from nltk.stem.porter import *
import spacy
from sense2vec import Sense2VecComponent

spacy_nlp = spacy.load("en_core_web_sm")
s2v = Sense2VecComponent(spacy_nlp.vocab).from_disk("./s2v_old")
spacy_nlp.add_pipe(s2v)

t5_generator = pipeline("question-generation")
t2t_generator = TextGenerator(output_type="question")


def generate_from_T5(context, n=5):
    res = t5_generator(context)
    ans = []
    que = []
    for i, r in enumerate(res):
        if i < n:
            ans.append(r['answer'])
            que.append(r['question'])
    return que, ans


def generate_from_t2t(context, n=5):
    res = t2t_generator.predict([context] * n)
    ans = []
    que = []
    for r in res: