Example #1
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug
        
        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)
        
        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
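Several of the excerpts below repeat the same detection idiom as Example #1: call spacy.info(name) inside a try/except IOError and fall back to the spaCy 1 'en' shortcut when no spaCy 2 package is installed. A minimal standalone sketch of that idiom, assuming spacy.info() raises IOError for a missing model as it did in the spaCy 1.x/2.x versions these excerpts target (the helper name detect_english_model is ours, not from any excerpt):

import spacy

def detect_english_model(preferred='en_core_web_sm', fallback='en'):
    """Pick an installed English model using the try/except idiom above."""
    try:
        spacy.info(preferred)  # raises IOError if the package is not installed
        return preferred
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        return fallback

# nlp = spacy.load(detect_english_model())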
Example #2
    def __init__(self, nlp=None, coref=None):
        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)
        self.nlp = nlp

        if not coref:
            try:
                from neuralcoref_lib.neuralcoref import Coref
                self.coref = Coref(nlp=self.nlp)
            except:
                self.coref = coref
                logging.info('Coreference not available')

        else:
            self.coref = coref

        self.spacy_tags = {
            'pos': {
                'noun': ['NOUN', 'PROPN'],
                'pronoun': ['PRON']
            },
            'dep': {
                'subject': ['csubj', 'csubjpass', 'nsubj',
                            'nsubjpass'],  #'agent','expl',
                'object': ['dobj', 'iobj', 'oprd']
            }
        }  # ADD 'attr' for spacy 2
        self.conversational_pronouns = ["i", "you"]
        self.grammatical_ranking = ['S', 'O', 'X']
Example #3
 def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None,
              blacklist=True, debug=False):
     self.greedyness = greedyness
     self.max_dist = max_dist
     self.max_dist_match = max_dist_match
     self.debug = debug
     embed_path = 'weights/'
     if embed_path is not None:
         self.embed_extractor = EmbeddingExtractor(embed_path)
     #model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
     #model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
     model_path = "checkpoints/"
     print("Loading neuralcoref model from", model_path)
     self.coref_model = CModel(model_path)
     if nlp is None:
         print("Loading spacy model")
         try:
             spacy.info('en_core_web_sm')
             model = 'en_core_web_sm'
         except IOError:
             print("No spacy 2 model detected, using spacy1 'en' model")
             spacy.info('en')
             model = 'en'
         nlp = spacy.load(model)
     self.data = Document(nlp, conll=conll, blacklist=blacklist, model_path='weights/')
     self.clusters = {}
     self.mention_to_cluster = []
     self.mentions_single_scores = {}
     self.mentions_pairs_scores = {}
Example #5
def mention_detection_debug(sentence):
    print("🌋 Loading spacy model")
    try:
        spacy.info("en_core_web_sm")
        model = "en_core_web_sm"
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        spacy.info("en")
        model = "en"
    nlp = spacy.load(model)
    doc = nlp(sentence.decode("utf-8"))
    mentions = extract_mentions_spans(doc, blacklist={}, debug=True)
    for mention in mentions:
        print(mention)
Example #6
def mention_detection_debug(sentence):
    print(u"🌋 Loading spacy model")
    try:
        spacy.info('en_core_web_sm')
        model = 'en_core_web_sm'
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        spacy.info('en')
        model = 'en'
    nlp = spacy.load(model)
    doc = nlp(sentence.decode('utf-8'))
    mentions = extract_mentions_spans(doc, blacklist=False, debug=True)
    for mention in mentions:
        print(mention)
Example #7
def list_linked_spacy_models():
    """ Read SPACY/data and return a list of link_name """
    spacy_data = os.path.join(spacy.info(silent=True)['Location'], 'data')
    linked = [d for d in os.listdir(spacy_data) if os.path.islink(os.path.join(spacy_data, d))]
    # linked = [os.path.join(spacy_data, d) for d in os.listdir(spacy_data)]
    # linked = {os.readlink(d): os.path.basename(d) for d in linked if os.path.islink(d)}
    return linked
Example #8
def get_nlp(language):
    """Get NLP using spacy."""
    import spacy  # pylint: disable=import-outside-toplevel

    if language not in spacy.info()["Models"]:
        spacy.cli.download(language)
    return spacy.load(language)
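A hypothetical usage of the get_nlp() helper from Example #8. Note that it indexes spacy.info() with the "Models" key; the keys reported by spacy.info() changed across releases (compare the lowercase 'pipelines'/'models' handling in Examples #13 and #14 below), so treat this as version-dependent:

# Hypothetical call to get_nlp() above: downloads the model via
# spacy.cli.download on first use if spacy.info() does not report it.
nlp = get_nlp("en_core_web_sm")
doc = nlp("spaCy downloads missing models on demand.")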
Example #9
    def __init__(self):
        logger.info("Loading NLP model: spaCy en_core_web_lg")

        self.nlp = {
            "en": spacy.load("en_core_web_lg", disable=['parser', 'tagger'])
        }

        logger.info("Printing spaCy model and package details:"
                    "\n\n {}\n\n".format(spacy.info("en_core_web_lg")))
Example #10
    def test_download(self):
        # Download model beforehand
        model_path = download_model('spacy', DEFAULT_CACHE_DIR,
                                    process_func=_unzip_process_func,
                                    verbose=True)

        info = spacy.info(model_path)
        self.assertListEqual(info['pipeline'], ['tagger', 'parser', 'ner'])
        self.assertEqual(info['lang'], 'da')
Example #11
    def __init__(self, models=None):
        if not models:
            models = {"en": "en_core_web_lg"}
        logger.debug(f"Loading SpaCy models: {models.values()}")

        self.nlp = {
            lang_code: spacy.load(model_name, disable=['parser', 'tagger'])
            for lang_code, model_name in models.items()
        }

        for model_name in models.values():
            logger.debug("Printing spaCy model and package details:"
                         "\n\n {}\n\n".format(spacy.info(model_name)))
Example #12
 def __init__(
     self,
     nlp=None,
     greedyness=0.5,
     max_dist=50,
     max_dist_match=500,
     conll=None,
     blacklist=True,
     debug=False,
 ):
     self.greedyness = greedyness
     self.max_dist = max_dist
     self.max_dist_match = max_dist_match
     self.debug = debug
     model_path = os.path.join(
         PACKAGE_DIRECTORY,
         "weights/conll/" if conll is not None else "weights/")
     model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
     print("Loading neuralcoref model from", model_path)
     self.coref_model = Model(model_path)
     if nlp is None:
         print("Loading spacy model")
         try:
             spacy.info("en_core_web_sm")
             model = "en_core_web_sm"
         except IOError:
             print("No spacy 2 model detected, using spacy1 'en' model")
             spacy.info("en")
             model = "en"
         nlp = spacy.load(model)
     self.data = Document(nlp,
                          conll=conll,
                          blacklist=blacklist,
                          model_path=model_path)
     self.clusters = {}
     self.mention_to_cluster = []
     self.mentions_single_scores = {}
     self.mentions_pairs_scores = {}
Example #13
File: spacy.py  Project: Jomcgi/scrubadub
    def check_spacy_model(model) -> bool:
        spacy_info = spacy.info()
        models = spacy_info.get('pipelines', spacy_info.get('models', None))
        if models is None:
            raise ValueError('Unable to detect spacy models.')
        models = list(models.keys())

        if model not in models:
            msg.info("Downloading spacy model {}".format(model))
            spacy.cli.download(model)
            # spacy.info() doesn't update after a spacy.cli.download, so there's no point checking it again
            models.append(model)

        # Always returns True; if the download fails, spacy calls sys.exit().
        return model in models
Example #14
def str2spacy(model):
    if int(spacy.__version__.split('.')[0]) < 3:
        downloaded_models = [os.path.basename(m) for m in list_downloaded_spacy_models()]
        links = list_linked_spacy_models()
    else:
        # As of spacy v3, links do not exist anymore and it is simpler to get a list of
        # downloaded models
        downloaded_models = list(spacy.info()['pipelines'])
        links = []
    filtered_downloaded = [m for m in downloaded_models if m[:2] == model]
    if model in downloaded_models + links:
        # Check whether `model` is the name of a model/link
        return model
    elif filtered_downloaded:
        # Check whether `model` is a lang code and corresponds to a downloaded model
        return filtered_downloaded[0]
    else:
        # Return asked model to have an informative error.
        return model
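Examples #13 and #14 both cope with the spaCy 2→3 change in how installed models are reported by spacy.info(). A small hedged sketch of just that lookup, reusing the key names those excerpts show ('pipelines' in spaCy ≥ 3, 'models' in some earlier releases); the helper name is ours, and the exact layout of the spacy.info() dict varies by version:

import spacy

def installed_model_names():
    """Best-effort list of locally installed model/pipeline names."""
    info = spacy.info() or {}  # some releases print the info and may return nothing
    names = info.get('pipelines', info.get('models', {})) or {}
    return list(names)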
Example #15
def mention_detection_debug(sentence, lang="en"):
    print(u"🌋 Loading spacy model")
    if lang == "en":
        try:
            spacy.info('en_core_web_sm')
            model = 'en_core_web_sm'
        except IOError:
            print("No spacy 2 model detected, using spacy1 'en' model")
            spacy.info('en')
            model = 'en'
    elif lang == "de":
        try:
            spacy.info("de_core_news_sm")
            model = "de_core_news_sm"
        except IOError as e:
            print("No German model detected:", e)
            return
    nlp = spacy.load(model)
    doc = nlp(sentence)
    mentions = extract_mentions_spans(doc, blacklist=False, debug=True)
    print("==-=-= Extracted mentions =-=-==")
    for mention in mentions:
        print(mention)
Example #16
    'pt': 'pt_core_news_sm',
    'it': 'it_core_news_sm',
    'nl': 'nl_core_news_sm',
    'xx': 'xx_core_news_sm'
}
from flask import Flask, request, jsonify
import spacy, re, os, sys

if len(sys.argv) < 2:
    print(
        "You must give the iso_code of the language model to launch (fr,en,de,es,pt,it,nl,xx). Exiting."
    )
    exit()
else:
    # get spacy info
    spacy.info()
    # get path of models
    #path = spacy.util.get_data_path()
    #print("Path to models : ",path)
    # now load spacy model for the language
    lang = sys.argv[1]  # CHANGE ACCORDING TO THE LANGUAGE YOU WORK ON !
    try:
        nlp = spacy.load(lang_models[lang])
        app = Flask(__name__)
        print('Model : ' + lang + " loaded!")
    except Exception as e:
        print("Bad iso code or other error : ", str(e))
        exit()


@app.route("/check")
Example #17
def load_language_resource(descriptor):
    print(spacy.info(descriptor))
    nlp = spacy.load(descriptor)
    return nlp
Example #18
 def read_corpus(self, data_path, debug=False):
     print("🌋 Reading files")
     for dirpath, _, filenames in os.walk(data_path):
         print("In", dirpath, os.path.abspath(dirpath))
         file_list = [os.path.join(dirpath, f) for f in filenames if f.endswith(".v4_auto_conll") \
                     or f.endswith(".v4_gold_conll")]
         cleaned_file_list = []
         for f in file_list:
             fn = f.split('.')
             if fn[1] == "v4_auto_conll":
                 gold = fn[0] + "." + "v4_gold_conll"
                 if gold not in file_list:
                     cleaned_file_list.append(f)
             else:
                 cleaned_file_list.append(f)
         doc_list = parallel_process(cleaned_file_list, load_file)
         for docs in doc_list:  #executor.map(self.load_file, cleaned_file_list):
             for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
                 print("Imported", name)
                 if debug:
                     print("utts_text", utts_text)
                     print("utt_tokens", utt_tokens)
                     print("utts_corefs", utts_corefs)
                     print("utts_speakers", utts_speakers)
                     print("name, part", name, part)
                 self.utts_text += utts_text
                 self.utts_tokens += utt_tokens
                 self.utts_corefs += utts_corefs
                 self.utts_speakers += utts_speakers
                 self.utts_doc_idx += [len(self.docs_names)
                                       ] * len(utts_text)
                 self.docs_names.append((name, part))
     print("utts_text size", len(self.utts_text))
     print("utts_tokens size", len(self.utts_tokens))
     print("utts_corefs size", len(self.utts_corefs))
     print("utts_speakers size", len(self.utts_speakers))
     print("utts_doc_idx size", len(self.utts_doc_idx))
     print("🌋 Building docs")
     for name, part in self.docs_names:
         self.docs.append(
             ConllDoc(name=name,
                      part=part,
                      nlp=None,
                      blacklist=False,
                      consider_speakers=True,
                      embedding_extractor=self.embed_extractor,
                      conll=CONLL_GENRES[name[:2]]))
     print("🌋 Loading spacy model")
     try:
         spacy.info('en_core_web_sm')
         model = 'en_core_web_sm'
     except IOError:
         print("No spacy 2 model detected, using spacy1 'en' model")
         spacy.info('en')
         model = 'en'
     nlp = spacy.load(model)
     print("🌋 Parsing utterances and filling docs")
     doc_iter = (s for s in self.utts_text)
     for utt_tuple in tqdm(
             zip(nlp.pipe(doc_iter), self.utts_tokens, self.utts_corefs,
                 self.utts_speakers, self.utts_doc_idx)):
         spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
         if debug:
             print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
         doc = spacy_tokens
         if debug:
             out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
                       " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id)
             print(out_str.encode('utf-8'))
         self.docs[doc_id].add_conll_utterance(
             doc,
             conll_tokens,
             corefs,
             speaker,
             use_gold_mentions=self.use_gold_mentions)
Example #19
	def read_corpus(self, data_path, debug=False):
		# this function holds the key to constructing the memory module for the conll corpus
		# find the discourse end marker, that holds the key to forming stories for memory inference
		print("🌋 Reading files")
		#read_corpus_input = input()
		dir_walk_count = 0
		for dirpath, _, filenames in os.walk(data_path):
			#dir_walk_count += 1
			#if dir_walk_count > 5 :
			#    break
			print("In", dirpath, os.path.abspath(dirpath))
			file_list = [os.path.join(dirpath, f) for f in filenames if f.endswith(".v4_auto_conll") \
						or f.endswith(".v4_gold_conll")]
			cleaned_file_list = []
			for f in file_list:
				fn = f.split('.')
				if fn[1] == "v4_auto_conll":
					gold = fn[0] + "." + "v4_gold_conll"
					if gold not in file_list:
						cleaned_file_list.append(f)
				else:
					cleaned_file_list.append(f)
			#doc_list = parallel_process(cleaned_file_list, load_file)

			# what is the doc list ?
			for file in cleaned_file_list:#executor.map(self.load_file, cleaned_file_list):
				docs = load_file(file)
				for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
					print("Imported", name)
					if debug:
						print("utts_text", utts_text)
						print("utt_tokens", utt_tokens)
						print("utts_corefs", utts_corefs)
						print("utts_speakers", utts_speakers)
						print("name, part", name, part)
						#utterances_in_doc_input = input()
					self.utts_text += utts_text
					self.utts_tokens += utt_tokens
					self.utts_corefs += utts_corefs
					self.utts_speakers += utts_speakers
					self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
					self.docs_names.append((name, part))
		print("utts_text size", len(self.utts_text))
		print("utts_tokens size", len(self.utts_tokens))
		print("utts_corefs size", len(self.utts_corefs))
		print("utts_speakers size", len(self.utts_speakers))
		print("utts_doc_idx size", len(self.utts_doc_idx))
		print("🌋 Building docs")
		for name, part in self.docs_names:
			self.docs.append(ConllDoc(name=name, part=part, nlp=None,
									  blacklist=False, consider_speakers=True,
									  embedding_extractor=self.embed_extractor,
									  conll=CONLL_GENRES[name[:2]]))
		print("🌋 Loading spacy model")
		try:
			spacy.info('en_core_web_sm')
			model = 'en_core_web_sm'
		except IOError:
			print("No spacy 2 model detected, using spacy1 'en' model")
			spacy.info('en')
			model = 'en'
		nlp = spacy.load(model)
		print("🌋 Parsing utterances and filling docs")
		doc_iter = (s for s in self.utts_text)
		for utt_tuple in tqdm(zip(nlp.pipe(doc_iter),
										   self.utts_tokens, self.utts_corefs,
										   self.utts_speakers, self.utts_doc_idx)):
			spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
			if debug:
				print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
				#spacy_tokens_in_doc = input()
			doc = spacy_tokens
			if debug: 
				out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
						  " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id)
				
				print(out_str.encode('utf-8'))
				print("CONLL TOKENS HERE ARE")
				print(Doc(nlp.vocab,conll_tokens))
				print("SENTENCE EMBEDDING OF TOKENS ARE")
				#print(self.embed_extractor.get_average_embedding(conll_tokens))
				#out_str_input2 = input()
			self.docs[doc_id].add_conll_utterance(doc, conll_tokens, corefs, speaker,
												  use_gold_mentions=self.use_gold_mentions)
			del spacy_tokens, conll_tokens, corefs,speaker, doc_id
		del nlp, doc_iter
Example #20
    def read_corpus(self, data_path, model=None, debug=False):
        print("🌋 Reading files")
        for dirpath, _, filenames in os.walk(data_path):
            print("In", dirpath, os.path.abspath(dirpath))
            file_list = [
                os.path.join(dirpath, f) for f in filenames
                if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")
            ]
            cleaned_file_list = []
            for f in file_list:
                fn = f.split(".")
                if fn[1] == "v4_auto_conll":
                    gold = fn[0] + "." + "v4_gold_conll"
                    if gold not in file_list:
                        cleaned_file_list.append(f)
                else:
                    cleaned_file_list.append(f)
            doc_list = parallel_process(cleaned_file_list, load_file)
            for docs in doc_list:  # executor.map(self.load_file, cleaned_file_list):
                for (
                        utts_text,
                        utt_tokens,
                        utts_corefs,
                        utts_speakers,
                        name,
                        part,
                ) in docs:
                    if debug:
                        print("Imported", name)
                        print("utts_text", utts_text)
                        print("utt_tokens", utt_tokens)
                        print("utts_corefs", utts_corefs)
                        print("utts_speakers", utts_speakers)
                        print("name, part", name, part)
                    self.utts_text += utts_text
                    self.utts_tokens += utt_tokens
                    self.utts_corefs += utts_corefs
                    self.utts_speakers += utts_speakers
                    self.utts_doc_idx += [len(self.docs_names)
                                          ] * len(utts_text)
                    self.docs_names.append((name, part))
        print("utts_text size", len(self.utts_text))
        print("utts_tokens size", len(self.utts_tokens))
        print("utts_corefs size", len(self.utts_corefs))
        print("utts_speakers size", len(self.utts_speakers))
        print("utts_doc_idx size", len(self.utts_doc_idx))
        print("🌋 Building docs")
        for name, part in self.docs_names:
            self.docs.append(
                ConllDoc(
                    name=name,
                    part=part,
                    nlp=None,
                    blacklist=self.blacklist,
                    consider_speakers=True,
                    embedding_extractor=self.embed_extractor,
                    conll=CONLL_GENRES[name[:2]],
                ))
        print("🌋 Loading spacy model")

        if model is None:
            model_options = [
                "en_core_web_lg", "en_core_web_md", "en_core_web_sm", "en"
            ]
            for model_option in model_options:
                if not model:
                    try:
                        spacy.info(model_option)
                        model = model_option
                        print("Loading model", model_option)
                    except:
                        print("Could not detect model", model_option)
            if not model:
                print("Could not detect any suitable English model")
                return
        else:
            spacy.info(model)
            print("Loading model", model)
        nlp = spacy.load(model)
        print("🌋 Parsing utterances and filling docs with use_gold_mentions=" +
              (str(bool(self.gold_mentions))))
        doc_iter = (s for s in self.utts_text)
        for utt_tuple in tqdm(
                zip(
                    nlp.pipe(doc_iter),
                    self.utts_tokens,
                    self.utts_corefs,
                    self.utts_speakers,
                    self.utts_doc_idx,
                )):
            spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
            if debug:
                print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
            doc = spacy_tokens
            if debug:
                out_str = ("utterance " + unicode_(doc) + " corefs " +
                           unicode_(corefs) + " speaker " + unicode_(speaker) +
                           "doc_id" + unicode_(doc_id))
                print(out_str.encode("utf-8"))
            self.docs[doc_id].add_conll_utterance(
                doc,
                conll_tokens,
                corefs,
                speaker,
                use_gold_mentions=self.gold_mentions)
Example #21
import spacy

spacy.info()  # Show metadata for the installed models.

# Load the model:

# nlp = spacy.load('en')  # Does not work on macOS unless the following command is first run with sudo:
# $ sudo python -m spacy link MODEL es
# where MODEL is 'es_core_news_md' or 'es_core_news_sm' (whichever one was downloaded).

nlp = spacy.load(
    '/Users/lino/anaconda3/lib/python3.6/site-packages/es_core_news_sm/es_core_news_sm-2.0.0'
)

# Load a text

doc = nlp(
    u'Apenas faltan 24 horas para que Tim Cook, consejero delegado de Apple, vuelva al Steve Jobs Theatre, la única zona de su nuevo campus, junto con el centro de visitantes donde pueden acceder los no empleados de la manzana.'
)

# Tokenization:

tokens = [t.text for t in doc]
tokens_palabras = [t.orth_ for t in doc if not t.is_punct]  # Tokens without punctuation marks
tokens_enteros = [token.orth for token in doc]  # Uses a NUMERIC representation of the tokens
tokens_lexicos = [t.orth_ for t in doc if not (t.is_punct | t.is_stop)]  # Tokens without punctuation or stopwords

from spacy.symbols import ORTH, LEMMA, POS, TAG
Example #22
@Created   : 5/31/18 2:02 PM
@Desc      :
!pip install spacy -i https://pypi.mirrors.ustc.edu.cn/simple
!python -m spacy download en_core_web_md
"""

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from time import time
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from functools import reduce

import spacy
from joblib import Parallel, delayed

spacy.info()
spacy.info('en_core_web_md')
# /home/deco/miniconda2/envs/tf17/lib/python3.6/site-packages/en_core_web_md


def load_simple():
    # import en_core_web_md
    # nlp = en_core_web_sm.load()
    nlp = spacy.load('en_core_web_md')
    # nlp is a Language instance
    print('pipeline:', nlp.pipeline)
    print('pipe_names:', nlp.pipe_names)

    doc = nlp('Apple is looking at buying U.K. startup for $1 billion')
    # callable instance returning Doc instance
    print('tokens:')
Example #23
import spacy

spacy.info('en')
#spacy.info('en', markdown=True)

nlp = spacy.load('en')
doc = nlp(u"This is a sentence.")
print(doc[0].text)
print(doc[1].text)
print(doc[-1].text)
print(doc[2:3].text)
print([(w.text, w.pos_) for w in doc])

print('*******')
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
print('*******')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

print('*******')
doc = nlp(u'I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_,
          lexeme.suffix_, lexeme.is_alpha, lexeme.is_digit, lexeme.is_title,
          lexeme.lang_)
Example #24
for i in range(10):
    js.append(json.loads(f.readline()))
f.close()
review_df = pd.DataFrame(js)
review_df.shape

# ### Using spacy: [Installation instructions for spacy](https://spacy.io/docs/usage/)

# In[28]:

import spacy

# In[29]:

# model meta data
spacy.info('en_core_web_sm')

# In[30]:

# preload the language model
nlp = spacy.load('en_core_web_sm')

# In[31]:

review_df['text'][:2]

# In[32]:

# Keeping it in a pandas dataframe
doc_df = review_df['text'].apply(nlp)
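The last cell parses one review at a time with .apply(nlp); the corpus readers in Examples #18–#20 stream texts through nlp.pipe instead, which processes them in batches. A minimal sketch of the same step with nlp.pipe, reusing the nlp and review_df objects defined above and assuming pandas is imported as pd, as the earlier cells imply:

# Batched alternative to review_df['text'].apply(nlp): stream the texts
# through nlp.pipe and rebuild a Series aligned with the original index.
doc_df = pd.Series(list(nlp.pipe(review_df['text'])), index=review_df.index)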