Example No. 1
    def __init__(self, text=None, nlp=None, id=0, _doc=None):
        """
		:type text: str
		"""
        spacy.prefer_gpu()
        if isinstance(text, list):
            text = ' '.join([str(x) for x in text])
        self._id = id

        if text is None and _doc is None:
            raise ValueError('Either text or _doc should be given!')
        elif text is None:
            self._doc = _doc
            self._text = _doc.text
            self._nlp = nlp
        elif _doc is None:
            self._doc = None
            self._text = str(text)
            self._nlp = nlp or spacy.load('en_core_web_sm')
        else:
            raise ValueError('Either text or _doc should be None!')

        # self._doc was assigned above; only the derived caches start empty.
        self._tokens = None
        self._noun_chunks = None
        self._entity_chunks = None
        self._entities = None
        self._sentences = None
        self._sentences_method = None
        self._entity_graph = None
        self._syntax_graph = None
Example No. 2
def parse_sentence(text):
    import spacy
    import re
    spacy.prefer_gpu()
    nlp = spacy.load('en')  # spaCy 2.x shortcut link; spaCy 3 loads 'en_core_web_sm' instead
    boundary = re.compile('^[0-9]$')

    def custom_seg(doc):
        prev = doc[0].text
        length = len(doc)
        for index, token in enumerate(doc):
            if token.text == '.' and boundary.match(prev) and index != (length - 1):
                doc[index+1].sent_start = False
            prev = token.text
        return doc

    nlp.add_pipe(custom_seg, before='parser')  # spaCy 2.x API: a bare callable can be added directly

    doc = nlp(text)

    return_out = []

    for sentence in doc.sents:
        return_out.append(parse_context(sentence.text))

    return return_out
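The function above uses the spaCy 2.x convention of adding a bare callable to the pipeline. A minimal sketch of the same digit-boundary rule for spaCy 3.x, assuming an installed en_core_web_sm package (the component name 'digit_boundary' is just illustrative):

import re
import spacy
from spacy.language import Language

digit = re.compile('^[0-9]$')

@Language.component('digit_boundary')
def digit_boundary(doc):
    # Do not start a new sentence right after a "<digit>." token pair.
    prev = None
    for index, token in enumerate(doc):
        if prev is not None and token.text == '.' and digit.match(prev) and index != len(doc) - 1:
            doc[index + 1].is_sent_start = False
        prev = token.text
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('digit_boundary', before='parser')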
Example No. 3
def tokenize_function(lemmatization=True, ngrams_length=2, workers=1):
    spacy.prefer_gpu()  # call before spacy.load so the pipeline can use the GPU
    spacy_obj = spacy.load('en_core_web_sm')

    def tokenize(documents):
        # logging.info('Tokenizing {} documents...'.format(len(documents)))
        tokenized_documents = []
        for i, doc in enumerate(tqdm(spacy_obj.pipe(documents,
                                                    disable=["tagger", "parser", "ner"],
                                                    n_threads=workers),  # no-op since spaCy 2.1; spaCy 3 uses n_process
                                     desc='documents',
                                     total=len(documents))):
            tokens = []
            for token in doc:
                if token.is_stop:
                    token = '#'
                else:
                    if lemmatization:
                        token = token.lemma_
                    else:
                        token = token.text
                    token = remove_symbols(token)
                if token != '':
                    tokens.append(token.lower())

            ngrams = get_ngrams(tokens, max_length=ngrams_length)
            tokenized_documents.append(ngrams)

        return tokenized_documents

    return tokenize
Example No. 4
def spacy_gpu_nlp(text: str = ''):
    # GPU Computation
    spacy.prefer_gpu()

    # Load English tokenizer, tagger, parser, NER and word vectors
    # nlp = spacy.load("en_core_web_sm")   # Efficient - Good
    # nlp = spacy.load("en_core_web_md")  # Blend - Will fail, need to fix test 2_EntityPhraseConcept
    nlp = spacy.load("en_core_web_lg")  # Accurate - Good

    # Process whole document
    doc = nlp(text)

    # Analyze syntax of parts
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]

    ne_p_and_c = {
        '0_Verbs': [],
        '1_NounPhrases': [],
        '2_EntityPhraseConcept': []
    }

    # Find named entities, phrases and concepts
    for entity in doc.ents:
        ne_p_and_c['2_EntityPhraseConcept'].append(entity.text)

    ne_p_and_c['1_NounPhrases'] = noun_phrases
    ne_p_and_c['0_Verbs'] = verbs

    # print('Returning from spacy_gpu_nlp')
    # pprint(ne_p_and_c)
    # print('Returning from spacy_gpu_nlp')

    return ne_p_and_c
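A hypothetical call to the function above, assuming en_core_web_lg is installed; the exact strings depend on the model version:

result = spacy_gpu_nlp("Apple is looking at buying U.K. startup for $1 billion.")
print(result['0_Verbs'])                # e.g. ['look', 'buy']
print(result['2_EntityPhraseConcept'])  # e.g. ['Apple', 'U.K.', '$1 billion']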
Example No. 5
def train_model(json_file_path, epochs=20):
    train_data = preprocess(json_file_path)
    spacy.prefer_gpu()
    if len(train_data) >= 50:
        nlp = spacy.load(MODEL_PATH)
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
        with nlp.disable_pipes(*other_pipes):  # only train NER
            optimizer = nlp.resume_training()
            for itn in range(epochs):
                print("Starting iteration " + str(itn))
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data)
                for batch in batches:
                    text, annotations = zip(*batch)
                    try:
                        nlp.update(
                            text,  # batch of texts
                            annotations,  # batch of annotations
                            drop=0.25,  # dropout - make it harder to memorise data
                            sgd=optimizer,  # callable to update weights
                            losses=losses)
                    except Exception as e:
                        print("Skipping batch after error: {}".format(e))

                print(losses)
        nlp.to_disk(MODEL_PATH)
    return "Model trained and saved"
Example No. 6
    def __init__(self,
                 params,
                 spacy_size="md",
                 gpu=True,
                 viz=False,
                 verbose=False):
        self.greed = get_param(params, "greed", greed)
        self.max_dist = get_param(params, "max_dist", max_dist)
        self.max_dist_match = get_param(params, "max_dist_match",
                                        max_dist_match)
        self.blacklist = get_param(params, "blacklist", blacklist)

        spacy_model = "en_core_web_{}".format(spacy_size)
        # DISABLED = ["ner"]  # disable the ner module
        self.verbose = verbose
        if gpu:
            spacy.prefer_gpu()  # call before spacy.load so the pipeline can use the GPU

        if self.verbose:
            print("Loading spacy model...")
        self.nlp = spacy.load(spacy_model, disable=["ner"])

        self.viz = viz

        self.doc = None

        self.init_coref()
Example No. 7
 def __init__(
     self,
     text_field: str,
     doc_field: str,
     language: str = EN_CORE_WEB_SM,
     disable: Optional[List[str]] = None,
     exclude: Optional[List[str]] = None,
     pre: Optional[List[BasePreprocessor]] = None,
     memoize: bool = False,
     memoize_key: Optional[HashingFunction] = None,
     gpu: bool = False,
 ) -> None:
     name = type(self).__name__
     super().__init__(
         name,
         field_names=dict(text=text_field),
         mapped_field_names=dict(doc=doc_field),
         pre=pre,
         memoize=memoize,
         memoize_key=memoize_key,
     )
     self.gpu = gpu
     if self.gpu:
         spacy.prefer_gpu()
     self._nlp = spacy.load(language,
                            disable=disable or [],
                            exclude=exclude or [])
Example No. 8
def get_tokenizer(tokenize_method: str, lang='en'):
    r"""

    :param str tokenize_method: 获取tokenzier方法
    :param str lang: 语言,当前仅支持en
    :return: 返回tokenize函数
    """
    tokenizer_dict = {
        'spacy': None,
        'raw': _raw_split,
        'cn-char': _cn_char_split,
    }
    if tokenize_method == 'spacy':
        import spacy
        spacy.prefer_gpu()
        if lang != 'en':
            raise RuntimeError("Spacy only supports en right right.")
        if parse_version(spacy.__version__) >= parse_version('3.0'):
            en = spacy.load('en_core_web_sm')
        else:
            en = spacy.load(lang)
        tokenizer = lambda x: [w.text for w in en.tokenizer(x)]
    elif tokenize_method in tokenizer_dict:
        tokenizer = tokenizer_dict[tokenize_method]
    else:
        raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.")
    return tokenizer
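A hypothetical call to the function above, assuming the module's _raw_split and _cn_char_split helpers are defined and an English spaCy model is installed:

tokenize = get_tokenizer('spacy', lang='en')
print(tokenize("spaCy splits punctuation, doesn't it?"))
# e.g. ['spaCy', 'splits', 'punctuation', ',', 'does', "n't", 'it', '?']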
Example No. 9
def test_prefer_gpu():
    try:
        import cupy  # noqa: F401

        prefer_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        assert not prefer_gpu()
Example No. 10
def normalization(inputData):
    data = list(dict.fromkeys(inputData))
    spacy.prefer_gpu()
    nlp = spacy.load("ro_core_news_sm")
    result = list()
    for s in data:
        result.append(nlp(s))
    return result
Example No. 11
 def __init__(self, argv):
     super().__init__(command=__file__, argv=argv)
     spacy.prefer_gpu()
     self.nlp = spacy.load('en_core_web_sm')
     coref = neuralcoref.NeuralCoref(self.nlp.vocab)
     self.nlp.add_pipe(coref, name='neuralcoref')
     self.__text_processor = TextProcessor(self.nlp, self._driver)
     self.create_constraints()
Example No. 12
def load_spacy():
    print('loading spacy...')
    spacy.prefer_gpu()
    pipeline = spacy.load('en_core_web_lg')
    sentencizer = Sentencizer()
    pipeline.add_pipe(sentencizer, first=True)  # spaCy 2.x style; spaCy 3 takes the string name "sentencizer"
    print(pipeline.pipeline)  # list of the above

    return pipeline
Example No. 13
 def __init__(self, argv):
     super().__init__(command=__file__, argv=argv)
     spacy.prefer_gpu()
     self.nlp = spacy.load('en_core_web_sm')
     #coref = neuralcoref.NeuralCoref(self.nlp.vocab)
     #self.nlp.add_pipe(coref, name='neuralcoref');
     tr = pytextrank.TextRank()
     self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
     self.__text_processor = TextProcessor(self.nlp, self._driver)
     self.create_constraints()
Example No. 14
    def loadSpacyDoc(self, text):
        spacy.prefer_gpu()
        lang = English()
        sentencizer = lang.create_pipe("sentencizer")
        spacyNlp = spacy.load("en_core_web_lg")
        spacyNlp.add_pipe(sentencizer, before="parser")

        # Create SpaCy document for sentencizing and lemmatizing
        spacyDoc = spacyNlp(text)
        return spacyDoc
Example No. 15
def get_tokenizer():
    try:
        import spacy
        spacy.prefer_gpu()
        en = spacy.load('en')
        print('use spacy tokenizer')
        return lambda x: [w.text for w in en.tokenizer(x)]
    except Exception as e:
        print('use raw tokenizer')
        return lambda x: x.split()
Example No. 16
def get_document_embedding(text):
    """Generates the document embedding of a given report
    INPUT: Textual data
    OUTPUT: Text embedding"""
    spacy.prefer_gpu()
    nlp = spacy.load('en_core_sci_md')
    doc = nlp(text)
    embedding = doc.vector
    del nlp
    return embedding
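en_core_sci_md is a scispaCy biomedical model, so the function assumes that package is installed. A hypothetical call:

embedding = get_document_embedding("Chest X-ray shows no acute cardiopulmonary findings.")
print(embedding.shape)  # a 1-D numpy array whose length is the model's vector width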
Example No. 17
    def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
        print(os.path.dirname(spacy.__file__))
        if ExtractInformation.IS_GPU:
            spacy.prefer_gpu()

        self.modelSpacy = modelSpacy
        self.modelCoref = modelCoref
        self.stanfordClient = StanfordOpenIE()

        self.nlpCoref, self.nlpSpacy = self.initSpacy(modelSpacy, modelCoref)
Example No. 18
    def load(self, path, prefer_gpu=False):
        """
        Loads a spaCy model and sets it as new self.model.

        :param path: Path to directory of spaCy model.
        """
        if prefer_gpu:
            spacy.prefer_gpu()
        nlp = spacy.load(path)
        self.model = nlp
Example No. 19
    def __init__(self, tweet_frequency=800):
        self.logpath = './log/io/csv/cleaner/'
        self.rpath = './data/csv/metadata.csv'

        self.logger()
        self.df = self.csv_to_dataframe()
        spacy.prefer_gpu()
        self.nlp = spacy.load('en')

        self.tweet_freq = tweet_frequency
Example No. 20
def tokzenize(df):
    def helper(sent):
        tokens = [x.text for x in nlp.tokenizer(sent)]
        if len(tokens) > 300:
            return tokens[:300]
        return tokens

    spacy.prefer_gpu()
    nlp = spacy.load('en_core_web_sm')
    df['tokenized'] = df['reviewText'].apply(helper)
    return df
Example No. 21
    def __init__(self, spacy_model_name=None, cuda=-1):
        if cuda >= 0:
            spacy.prefer_gpu(cuda)  # prefer the requested CUDA device

        if spacy_model_name is None:
            self.nlp = spacy.blank("en")  # create blank Language class
            self.new = True
            logging.info("Created blank 'en' model")
        else:
            self.nlp = spacy.load(
                spacy_model_name)  # load existing spaCy model
            self.new = False
            logging.info("\nLoaded model '%s'", spacy_model_name)
Example No. 22
def find_name(text):
    # use gpu - faster
    spacy.prefer_gpu()  # comment out this line if GPU is not available

    # load English SpaCy model
    nlp = spacy.load("en_core_web_sm")

    # get encodings from text
    doc = nlp(text)

    # return person category
    return [ee for ee in doc.ents if ee.label_ == 'PERSON']
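A hypothetical usage of the function above, assuming en_core_web_sm is installed; the returned items are spaCy Span objects labelled PERSON:

people = find_name("Ada Lovelace corresponded with Charles Babbage about the engine.")
print([person.text for person in people])  # e.g. ['Ada Lovelace', 'Charles Babbage']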
Example No. 23
 def setupModel(self):
     """
     Setup the NLP models for computing similarity.
     """
     try:
         # prefer to use GPU if available:
         spacy.prefer_gpu()
         spacymodel = self.app_config.checkAndSanitizeConfigString(
             'plugins', 'mod_dedupe_spacymodel')
         self.nlpModel = spacy.load(spacymodel)
     except Exception as e:
         logger.error("Error loading the NLP model for de-dupe: %s", e)
Example No. 24
def convert_to_vector_representation(data):
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")
    vectors = []
    for document, star in tqdm(data):
        #document = nlp(document)
        document_vectors = []
        for i in document:
            i = nlp(i)
            document_vectors.append(i.vector)
        vectors.append((torch.tensor(document_vectors), star))
    return vectors
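Calling nlp() once per string is slow. A hedged alternative sketch, assuming the same en_core_web_lg vectors, that batches the strings through nlp.pipe and stacks the vectors before building the tensor (document_to_tensor is an illustrative helper, not part of the original code):

import numpy as np
import torch

def document_to_tensor(strings, nlp):
    # Batch all strings through the pipeline instead of one nlp() call each.
    docs = nlp.pipe(strings, disable=["tagger", "parser", "ner"])
    return torch.tensor(np.stack([doc.vector for doc in docs]))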
Example No. 25
def dependency_parse(filepath):
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")

    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    tokpath = os.path.join(dirpath, filepre + '.toks')
    parentpath = os.path.join(dirpath, filepre + '.parents')
    relpath = os.path.join(dirpath, filepre + '.rels')
    pospath = os.path.join(dirpath, filepre + '.pos')
    tagpath = os.path.join(dirpath, filepre + '.tag')
    lenpath = os.path.join(dirpath, filepre + '.len')

    with open(tokpath, 'w', encoding='utf-8') as tokfile, \
            open(relpath, 'w', encoding='utf-8') as relfile, \
            open(parentpath, 'w', encoding='utf-8') as parfile, \
            open(lenpath, 'w', encoding='utf-8') as lenfile, \
            open(tagpath, 'w', encoding='utf-8') as tagfile, \
            open(pospath, 'w', encoding='utf-8') as posfile:
        with open(os.path.join(dirpath, 'a.txt'), 'r', encoding='utf-8') as f:
            for line in f:
                l = line.split(' ')
                l = [i for i in l if i != '']
                newline = ' '.join(l)
                doc = nlp(newline)
                json_doc = doc.to_json()
                token = json_doc['tokens']
                pos = []
                tag = []
                dep = []
                tok = []
                parent = []
                length = json_doc['sents'][0]['end'] + 1
                for t in token:
                    if t['pos'] != 'SPACE':
                        tok.append(doc[t['id']].text)
                        pos.append(t['pos'])
                        tag.append(t['tag'])
                        dep.append(t['dep'])
                        head = t['head']
                        if t['dep'] == 'ROOT':
                            head = 0
                        else:
                            head = head + 1
                        parent.append(head)
                tokfile.write(' '.join(tok) + '\n')
                posfile.write(' '.join(pos) + '\n')
                tagfile.write(' '.join(tag) + '\n')
                relfile.write(' '.join(dep) + '\n')
                parfile.writelines(["%s " % str(item) for item in parent])
                parfile.write('\n')
                lenfile.write(str(length) + '\n')
Example No. 26
    def __init__(self,
                 lang: str = "en_core_web_sm",
                 nlp: spacy.language.Language = None,
                 neuralcoref: bool = False,
                 device: str = None,
                 *args,
                 **kwargs):

        # Set all the parameters
        self.lang = lang
        self.neuralcoref = neuralcoref
        self._prebuilt = True

        # Set the device
        self._on_gpu = False
        if device and (device == "gpu" or device.startswith("cuda")):
            spacy.prefer_gpu(
                gpu_id=0 if ":" not in device else int(device.split(":")[1]))
            # spaCy sets the default torch float Tensor to torch.cuda.FloatTensor,
            # which causes other GPU cached ops to crash.
            torch.set_default_tensor_type("torch.FloatTensor")
            self._on_gpu = True

        # Load up the Spacy module
        self._nlp = nlp
        if not nlp:
            self._nlp = self._load_spacy(lang=lang)
            self._prebuilt = False

        # Add neuralcoref
        self._add_neuralcoref()

        if not nlp:
            super(Spacy, self).__init__(
                lang=lang,
                neuralcoref=neuralcoref,
                *args,
                **kwargs,
            )
        else:
            super(Spacy, self).__init__(
                lang=nlp.lang,
                # No need to pass in neuralcoref separately, it's already in the
                # pipeline if neuralcoref=True
                pipeline=nlp.pipe_names,
                *args,
                **kwargs,
            )
            print(
                "Warning: Spacy.encode does not support arbitrary nlp pipelines so "
                "information stored in the Doc object may be lost in encoding."
            )
Example No. 27
 def transform(self, X):
     spacy.prefer_gpu()
     nlp = spacy.load("en_core_web_lg")
     new = []
     for sentence in X:
         doc = nlp(sentence)
         json_doc = doc.to_json()
         token = json_doc['tokens']
         tag = []
         for t in token:
             tag.append(t['tag'])
         new.append(" ".join(tag))
     return new
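A hedged, roughly equivalent sketch of the tag extraction above that skips the doc.to_json() round trip, assuming the same en_core_web_lg model (transform_tags is an illustrative name):

def transform_tags(nlp, sentences):
    # One space-joined string of fine-grained tags per input sentence.
    return [" ".join(token.tag_ for token in nlp(sentence)) for sentence in sentences]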
Example No. 28
    def __init__(self, limit=10000, enable_gpu=False):
        self.limit = limit
        self.library = None
        self.texts = collections.OrderedDict()

        if enable_gpu:
            spacy.prefer_gpu()
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.max_length = self.limit
        self.texts_pos = collections.OrderedDict()

        self.texts_stats = collections.OrderedDict()
        self.texts_dists = collections.OrderedDict()
Example No. 29
def parse_context(text):
    import spacy
    spacy.prefer_gpu()
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    #return_out = [text]
    return_out = []
    for token in doc:
        return_out.append(token.pos_)

    return return_out
Example No. 30
    def __init__(self,
                 subreddit=None,
                 min_score=4,
                 min_len=8,
                 min_sent_len=6,
                 max_sent_len=32,
                 author=None,
                 before=None,
                 size=32,
                 max_retries=5,
                 counter_file="vocab_counter.pkl",
                 collectionp_file="collection.pkl",
                 sentence_file="sentences.txt",
                 spacy_use=False):
        self.subreddit = subreddit
        self.min_score = min_score
        self.min_len = min_len
        self.min_sent_len = min_sent_len
        self.max_sent_len = max_sent_len
        self.author = author
        self.before = before
        self.size = size
        self.max_retries = max_retries
        # a praw.ini file is needed in the directory for this scraper to work
        self.reddit = praw.Reddit("scraper1")
        if spacy_use:
            spacy.prefer_gpu()
            self.nlp = spacy.load("en")
        else:
            download('punkt')
        self.spacy_use = spacy_use
        self.counter = collections.Counter()
        self.commentids = []
        self.after = None

        self.collectionp_file = collectionp_file
        self.counter_file = counter_file
        self.sentence_file = sentence_file

        self.sent_count = 0

        if os.path.isfile(self.collectionp_file):
            self.collectionp = pickle.load(open(self.collectionp_file, "rb"))
            self.commentids = self.collectionp["commentids"]
            self.after = self.collectionp["after"]
            self.sent_count = self.collectionp["sentence_count"]

        if os.path.isfile(self.counter_file):
            self.counter = pickle.load(open(self.counter_file, "rb"))
Example No. 31
def test_prefer_gpu():
    assert not prefer_gpu()