def __init__(self, nlp: Language = None, support_overlap: bool = False,
             log_level: int = logging.WARNING, encoding: str = None,
             doc_name_depth: int = 0, **kwargs):
    """Initialize the reader and register the ``doc_name`` Doc extension.

    @param nlp: spaCy Language model (required; raises if omitted)
    @param support_overlap: whether overlapped annotations must be supported
    @param log_level: logging level configuration
    @param encoding: txt encoding
    @param doc_name_depth: depth of parent directories to add into doc_name.
        default is 0: only use file name;
        1: use 1 level parent directory name + file name;
        -1: use full absolute path — if you are dealing with multiple
        directories, this is helpful to locate the original files
    @param kwargs: other parameters, set verbatim as instance attributes
    """
    # Attach any extra configuration directly to the instance.
    for param_name, value in kwargs.items():
        setattr(self, param_name, value)
    # NOTE: NameError is kept (rather than ValueError) so existing callers
    # that catch it keep working.
    if nlp is None:
        raise NameError('parameter "nlp" need to be defined')
    self.nlp = nlp
    self.encoding = encoding
    self.doc_name_depth = doc_name_depth
    self.support_overlap = support_overlap
    self.set_logger(log_level)
    # Register once; later instances reuse the existing extension.
    if not Doc.has_extension('doc_name'):
        Doc.set_extension('doc_name', default='')
def handle(self, *args, **options):
    """Run the spaCy entity-extraction socket server.

    Loads the custom model, registers span/doc extensions, then serves
    forever: for each connected client, reads a text payload, runs the
    pipeline, and sends back the extracted entities as JSON terminated
    by the ``--end--`` sentinel.
    """
    spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER,
                             disable=['parser', 'tagger', 'textcat'])
    # force=True: safe to re-register if the command is run more than once.
    Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
    Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
    Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
    Doc.set_extension('_lines', default=list())
    logger.debug("Loaded spacy server")
    main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
    while True:
        readable, writeable, exceptions = select(read_socks, write_socks, [])
        for sockobj in readable:
            if sockobj in main_socks:
                # New incoming connection on a listening socket.
                new_sock, address = sockobj.accept()
                logger.debug('Connect: %s - %s', address, id(new_sock))
                read_socks.append(new_sock)
            else:
                try:
                    entities = []
                    data = recv_end(sockobj)
                    if not data:
                        # Client hung up: drop the socket from the select set.
                        sockobj.close()
                        read_socks.remove(sockobj)
                    else:
                        for doc in spacy_model.pipe([data]):
                            # Record newline offsets so spans can report line numbers.
                            doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                            for ent in doc.ents:
                                current_entity = self.get_ent(ent)
                                if current_entity:
                                    entities.append(current_entity)
                        sockobj.sendall(json.dumps(entities).encode('utf8') + '--end--'.encode('utf8'))
                except Exception:
                    # Keep serving other clients, but log the failure instead
                    # of silently swallowing it (the original bare `except: pass`
                    # hid every error, including programming bugs).
                    logger.exception('Error while handling spacy client request')
def __init__(self):
    """Register this component's Doc extension and the token lexical flag."""
    super().__init__()
    registrations = (
        (Doc, self.name, []),
        (Token, 'is_lexical', False),
    )
    for target, attr, default in registrations:
        if not target.has_extension(attr):
            target.set_extension(attr, default=default)
def add_span_extensions():
    """Register the Doc/Span extensions used for relation and entity annotation.

    Guarded with ``has_extension`` so the function is safe to call more than
    once — spaCy raises ``ValueError`` on duplicate registration otherwise
    (the unguarded original failed on any second call).
    """
    for doc_extension in ('relations', 'entities'):
        if not Doc.has_extension(doc_extension):
            Doc.set_extension(doc_extension, default=None)
    for span_extension in [
            'entity_type', 'entity_id', 'foodon', 'hansard', 'hansardClosest',
            'hansardParent', 'snomedct', 'synonyms']:
        if not Span.has_extension(span_extension):
            Span.set_extension(span_extension, default=None)
def __init__(self, clf, extension='score'):
    """Wrap a classifier and ensure its score extension exists on Doc.

    :type clf: Classifier, needs to have a predict(X) function
    """
    self.clf = clf
    self.extension = extension
    if Doc.has_extension(extension):
        return
    # -1 marks "not scored yet".
    Doc.set_extension(extension, default=-1)
def __init__(self, first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
             last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
    """Store the extension names this matcher uses and register each with
    spaCy unless it already exists."""
    self.token_extension_name = self.TOKEN_EXTENSION_NAME
    self.span_extension_name = self.SPAN_EXTENSION_NAME
    self.doc_extension_name = self.DOC_EXTENSION_NAME
    self.first_name_extension_name = first_name_extension_name
    self.last_name_extension_name = last_name_extension_name
    # (target, attribute, set_extension kwargs) — token gets a stored flag,
    # span a computed getter, doc a stored list.
    for target, attr, ext_kwargs in (
        (Token, self.token_extension_name, {'default': self.ANOT_NONE}),
        (Span, self.span_extension_name, {'getter': self.is_full_name_getter}),
        (Doc, self.doc_extension_name, {'default': []}),
    ):
        if not target.has_extension(attr):
            target.set_extension(attr, **ext_kwargs)
def __init__(self, links, **kwargs):
    """Spider constructor: queue the seed URL, load the spaCy model, register
    span/doc extensions, and open a TCP connection to the spaCy service.

    @param links: a start URL; note that despite the plural name it is
        appended to start_urls as a single element
    @param kwargs: forwarded unchanged to the parent spider constructor
    """
    self.start_urls.append(links)
    # Function-scope imports — presumably to defer the spaCy import cost to
    # spider construction time; confirm before moving to module level.
    import spacy
    from spacy.tokens.doc import Doc
    from spacy.tokens.span import Span
    self.spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER)
    # force=True: re-registration is safe when several spiders are created.
    Span.set_extension('line_number', getter=TagLinkSpider.line_number_getter, force=True)
    Doc.set_extension('lines', getter=TagLinkSpider.get_lines, setter=TagLinkSpider.set_lines)
    Doc.set_extension('_lines', default=list())
    # Persistent socket to the spaCy tagging service on SPACY_PORT.
    self.soc_spacy = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.soc_spacy.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    connect(self.soc_spacy, '', settings.SPACY_PORT)
    super().__init__(**kwargs)
def __init__(self):
    """Register lexical Doc extensions and load the TAALED word lists."""
    if not Doc.has_extension('taaled_lemmas'):
        Doc.set_extension('taaled_lemmas', default=[])
    if not Doc.has_extension('context_tokens'):
        Doc.set_extension('context_tokens', default=[])
    if not Doc.has_extension('function_tokens'):
        Doc.set_extension('function_tokens', default=[])
    # Load TAALED word list files
    # source: https://github.com/kristopherkyle/TAALED/tree/master/TAALED_1_3_1_Py3/dep_files
    module_path = os.path.abspath(os.path.dirname(__file__))
    adj_lem_list_path = os.path.join(module_path, "Corpora/adj_lem_list.txt")
    real_words_path = os.path.join(module_path, "Corpora/real_words.txt")
    # Context managers close the handles deterministically — the original
    # `open(...).read()` calls leaked the file objects.
    # [:-1] drops the empty entry after the trailing newline.
    with open(adj_lem_list_path, "r", errors='ignore') as adj_file:
        self.adj_word_list = adj_file.read().split("\n")[:-1]
    with open(real_words_path, "r", errors='ignore') as real_file:
        self.real_word_list = real_file.read().split("\n")[:-1]
def init_component(self):
    """Attach the keyword-extraction method and its candidate cache to Doc."""
    # The two registrations are independent; each is guarded separately.
    if not Doc.has_extension("kw_candidates"):
        Doc.set_extension("kw_candidates", default=None)
    if not Doc.has_extension("extract_keywords"):
        Doc.set_extension("extract_keywords", method=self.extract_keywords)
"""mapping of char offset to token index; token whitespace included; faster than approch in documentation""" d = {tok.idx: i for i, tok in enumerate(spacy_doc)} i = 0 for idx in range(spacy_doc[-1].idx + len(spacy_doc[-1].text) + 1): if idx in d: i = d[idx] else: d[idx] = i return d # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Doc extensions to set mapping of chr offsets ("idx") to token index ("ti") # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Doc.set_extension('_idx_to_ti_map', default=None) def set_idx_to_ti_map(doc): doc._._idx_to_ti_map = _chr2tok(doc) def get_idx_to_ti_map(doc): if doc._._idx_to_ti_map is None: set_idx_to_ti_map(doc) return doc._._idx_to_ti_map Doc.set_extension('idx_to_ti_map', getter=get_idx_to_ti_map)
def __init__(self):
    """Ensure both the current and the legacy Doc result extensions exist."""
    for attr in (self.name, self.name + '_legacy'):
        if not Doc.has_extension(attr):
            Doc.set_extension(attr, default=[])
def __init__(self):
    """Register the 'features' Doc extension backed by an OrderedDict."""
    if Doc.has_extension("features"):
        return
    Doc.set_extension("features", default=OrderedDict())
from typing import Dict, List
from allennlp.data import Batch
from dygie.models.dygie import DyGIE
from dygie.data.dataset_readers.dygie import DyGIEReader
from allennlp.models.archival import load_archive
from allennlp.nn import util
from spacy.language import Language
from spacy.tokens import Span
from spacy.tokens.doc import Doc
# NOTE(review): duplicate import — Span is already imported from spacy.tokens
# above; safe to remove.
from spacy.tokens.span import Span

# Relation / entity / event containers; force=True so module reloads do not
# raise on re-registration.
Doc.set_extension("rels", default=[], force=True)
Span.set_extension("rels", default=[], force=True)
Doc.set_extension("span_ents", default=[], force=True)
Span.set_extension("label_", default=[], force=True)
Doc.set_extension("events", default=[], force=True)
Span.set_extension("events", default=[], force=True)


def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
    """Copy DyGIE prediction output onto the spaCy doc's custom extensions.

    NOTE(review): this function continues beyond the visible chunk; only the
    opening of the event-processing loop is shown here.
    """
    doc_rels = []
    doc_evs = []
    # store events as relations. include confidence scores in the relation
    # tuple (TODO: add relation property)
    # Iterates sentence-aligned: one predicted-event list per doc sentence.
    for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
        sent_evs = []
        for ev in evs:
            if len(ev) >= 3:
                trig = [r for r in ev if r[1] == "TRIGGER"]
                # example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]]
                arg0s = [r for r in ev if r[2] == "ARG0"]
                arg1s = [r for r in ev if r[2] == "ARG1"]
def __init__(self, paths=None):
    """Initialize corpus-based component: register Doc extensions, resolve
    corpora file paths, record corpus totals, and load the corpora.

    paths:list -> a list of strings, each of which is a path to one of the
        corpora listed below, in the fixed order NGSL, NAWL, BSL, TSL,
        COCA Academic, COCA Technical, COCA General. When None, the files
        bundled under Corpora/ next to this module are used.
    """
    super().__init__()
    # Guard every registration — in the original only the first extension was
    # guarded, so instantiating this component twice raised ValueError on the
    # unguarded Doc.set_extension calls.
    for attr in (self.name, 'ngsl_words', 'nawl_words', 'tsl_words',
                 'fpc_words', 'cocaacad_words', 'cocatech_words',
                 'cocagenband1_words', 'cocagenband2_words',
                 'cocagenband3_words'):
        if not Doc.has_extension(attr):
            Doc.set_extension(attr, default=[])
    if paths is None:
        # file locations bundled with the package
        here = os.path.dirname(__file__)
        self.fnameNGSL = os.path.join(here, 'Corpora/NGSL+1.01+by+band - Frequency.csv')
        self.fnameNAWL = os.path.join(here, 'Corpora/NAWL_SFI.csv')
        self.fnameBSL = os.path.join(here, 'Corpora/BSL_1.01_SFI_freq_bands.csv')
        self.fnameTSL = os.path.join(here, 'Corpora/TSL+1.1+Ranked+by+Frequency - TSL.csv')
        self.fnameCOCAAcad = os.path.join(here, 'Corpora/COCA Academic.csv')
        self.fnameCOCATech = os.path.join(here, 'Corpora/COCA Technical.csv')
        self.fnameCOCAGen = os.path.join(here, 'Corpora/COCA General.csv')
    else:
        # file locations passed as a parameter to the constructor
        self.fnameNGSL = paths[0]
        self.fnameNAWL = paths[1]
        self.fnameBSL = paths[2]
        self.fnameTSL = paths[3]
        self.fnameCOCAAcad = paths[4]
        self.fnameCOCATech = paths[5]
        self.fnameCOCAGen = paths[6]
    # Corpus token totals. Taken from Vishal's code.
    self.NGSLTotal = 273613534
    self.NAWLTotal = 288176225
    self.TSLTotal = 1560194
    self.BSLTotal = 64651722
    self.COCAAcadTotal = 120032441
    # read the corpora
    self.read_corpora()
    self.nlp = spacy.load("en_core_web_sm")
from typing import Tuple, List, Iterable, Optional, Dict, Callable, Any
from spacy.scorer import PRFScore
from thinc.types import Floats2d
import numpy
from spacy.training.example import Example
from thinc.api import Model, Optimizer
from spacy.tokens.doc import Doc
from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.vocab import Vocab
from spacy import Language
from thinc.model import set_dropout_rate
from wasabi import Printer

# Per-doc relation store. NOTE(review): spaCy shares a mutable `default={}`
# object across Docs until the component reassigns doc._.rel — confirm the
# pipe always overwrites it before use.
Doc.set_extension("rel", default={}, force=True)
msg = Printer()


@Language.factory(
    "relation_extractor",
    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
    assigns=["doc._.rel"],
    default_score_weights={
        "rel_micro_p": None,
        "rel_micro_r": None,
        "rel_micro_f": None,
    },
)
# NOTE(review): the signature below continues past the visible chunk — the
# closing parenthesis and function body are not shown here.
def make_relation_extractor(
    nlp: Language, name: str, model: Model, *, threshold: float
        # NOTE(review): this chunk begins mid-definition — the lone `)` below
        # closes a call started outside the visible range, and the code up to
        # `return inner` is the tail of a decorator factory (`extend`, per the
        # usage further down) whose header is not visible. Indentation here is
        # reconstructed and should be verified against the full file.
        )
        if type == 'method':
            # Register `func` as a callable extension method.
            target.set_extension(name, method=func, force=force)
        if type == 'property':
            if create_attribute:
                logger.trace(f"Creating attribute '_{name}'")
                # Backing slot for the computed property.
                target.set_extension("_" + name, default=default, force=force)
            # The public name reads through the getter (and optional setter).
            target.set_extension(name, getter=func, force=force, setter=setter)
        return func
    return inner


# ATTRIBUTES
Doc.set_extension('id', default=None, force=force)


# PROPERTIES
@extend(Doc, 'property', create_attribute=True)
def token_map(self: Doc):
    # Lazily built list mapping each character position to its token index;
    # each token contributes len(token) entries plus one for its trailing
    # whitespace, if any.
    # TODO: another candidate for porting to faster code
    if not self._._token_map:
        token_map = []
        for i, token in enumerate(self):
            token_map.extend([i] * (len(token) + (1 if token.whitespace_ else 0)))
        self._._token_map = token_map
    return self._._token_map