Example #1
    def get_spacy(self, load_from_file=False, model_name='en_core_web_sm'):
        import spacy
        global nlp
        if not nlp:  # assumes a module-level ``nlp = None`` sentinel
            nlp = spacy.load(model_name)

        doc = None
        if self.parsed and load_from_file:
            from spacy.tokens.doc import Doc

            try:
                # spaCy 1.x API: read_bytes() yields one byte string per
                # serialized doc; keep the last one found in the cache file
                for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
                    doc = Doc(nlp.vocab)
                    doc.from_bytes(byte_string)
            except UnicodeDecodeError:
                print("!! UNICODE ERROR:", self.fnfn_spacy)

        if not doc:
            # no cached parse available: clean the raw text and parse it now
            txt = clean_text(self.text)
            doc = nlp(txt)

        return doc
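
This method leans on two module-level names that are easy to miss when reading it in isolation: the nlp cache it declares global, and the clean_text() helper. A minimal sketch of that scaffolding, with a hypothetical stand-in body for the helper:

nlp = None  # module-level cache; get_spacy() loads the model on first use

def clean_text(txt):
    # hypothetical stand-in for the project's real clean_text() helper:
    # collapse whitespace so spaCy sees one clean string
    return ' '.join(txt.split())

# usage, given an object with .parsed, .fnfn_spacy and .text attributes:
# doc = record.get_spacy(load_from_file=True)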
Example #2
def test_efficient_binary_serialization(doc):
    import spacy.en
    from spacy.tokens.doc import Doc

    # dump the parsed doc to disk in spaCy's binary format
    byte_string = doc.to_bytes()
    with open('moby_dick.bin', 'wb') as f:
        f.write(byte_string)

    # spaCy 1.x API: English() loads the pipeline, and Doc.read_bytes()
    # yields one byte string per serialized doc in the file
    nlp = spacy.en.English()
    with open('moby_dick.bin', 'rb') as f:
        for byte_string in Doc.read_bytes(f):
            doc = Doc(nlp.vocab)
            doc.from_bytes(byte_string)
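
Doc.read_bytes() and spacy.en.English() are spaCy 1.x APIs that were later removed. In spaCy 2+ the same round trip is a plain to_bytes()/from_bytes() pair, one doc per file. A rough equivalent, assuming the en_core_web_sm model is installed:

import spacy
from spacy.tokens import Doc

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Call me Ishmael.')

with open('moby_dick.bin', 'wb') as f:
    f.write(doc.to_bytes())

# from_bytes() returns the doc itself, so the call can be chained
with open('moby_dick.bin', 'rb') as f:
    restored = Doc(nlp.vocab).from_bytes(f.read())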
Example #3
import pickle

from spacy.tokens.doc import Doc


def read_docs(filepath):
    """Deserialize a list of documents + associated metadata"""
    spacy_parser = get_spacy_parser()  # project helper returning a loaded nlp
    with open(filepath, 'rb') as f:
        data = pickle.load(f)
    for row in data:
        doc = Doc(spacy_parser.vocab)
        # rebuild the Doc from its serialized byte string;
        # from_bytes() returns the doc itself
        row['content'] = doc.from_bytes(row.pop('binary_content'))
    return data
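
read_docs() expects each pickled row to carry its parse under the binary_content key. A hypothetical write_docs() counterpart that produces exactly that layout:

import pickle


def write_docs(filepath, rows):
    # hypothetical inverse of read_docs(): each row is a dict whose
    # 'content' key holds a parsed spaCy Doc
    data = []
    for row in rows:
        row = dict(row)
        row['binary_content'] = row.pop('content').to_bytes()
        data.append(row)
    with open(filepath, 'wb') as f:
        pickle.dump(data, f)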
Example #4
# assumes ``nlp`` and a parsed ``doc`` already exist from earlier in the example
def dependency_labels_to_root(token):
    """Walk up the syntactic tree, collecting the arc labels."""
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels

for sentence in doc.sents:
    for token in sentence:
        print(token)
        print(token.orth)
        dep_labels = dependency_labels_to_root(token)
        print(dep_labels)
        for dep_label in dep_labels:
            # map the integer label id back to its string form
            print(nlp.vocab.strings[dep_label])

doc = nlp(u"Mr. Best flew to New York on Saturday morning.")

for ent in doc.ents:
    print(ent, ent.label_, ent.orth_)
    print(ent.root, ent.root.head, ent.root.head.pos,
          nlp.vocab.strings[ent.root.head.pos], ent.root.head.lemma_)

from spacy.tokens.doc import Doc

byte_string = doc.to_bytes()
with open('moby_dick.bin', 'wb') as f:
    f.write(byte_string)

doc = Doc(nlp.vocab)
# spaCy 1.x API: read_bytes() yields one byte string per serialized doc
with open('moby_dick.bin', 'rb') as f:
    for byte_string in Doc.read_bytes(f):
        doc.from_bytes(byte_string)
print(doc)
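
For many documents in one file, spaCy 2.2+ replaces the read_bytes() framing with DocBin. A minimal sketch of the same save/load cycle, assuming a modern spaCy and en_core_web_sm:

import spacy
from spacy.tokens import DocBin

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Mr. Best flew to New York on Saturday morning.')

# pack any number of docs into a single binary blob
doc_bin = DocBin()
doc_bin.add(doc)
with open('moby_dick.bin', 'wb') as f:
    f.write(doc_bin.to_bytes())

# restore them against the same vocab
with open('moby_dick.bin', 'rb') as f:
    docs = list(DocBin().from_bytes(f.read()).get_docs(nlp.vocab))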
Example #5
def read_doc(spacy_fname, nlp):
    from spacy.tokens.doc import Doc

    print('reading ' + spacy_fname)
    with open(spacy_fname, 'rb') as f:
        byte_string = f.read()
    # one serialized doc per file: restore it against the shared vocab
    doc = Doc(nlp.vocab)
    doc.from_bytes(byte_string)
    return doc
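
The matching writer is a single line of I/O; a hypothetical write_doc() for symmetry:

def write_doc(spacy_fname, doc):
    # hypothetical counterpart to read_doc(): one serialized doc per file
    with open(spacy_fname, 'wb') as f:
        f.write(doc.to_bytes())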