def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Args:
        text (str): raw text to pre-process.
        max_length (int): maximum number of characters in a single text for
            spacy, defaults to 1,000,000 characters (1MB).
        spacy_model (model): an already loaded spacy model.
    """

    spacy_model = kwargs.get('spacy_model', None)

    if self.language == 'th':
        sentences = []
        text = preprocess(text)
        tokens = tokenize(text, engine='deepcut', remove_whitespace=True)
        # discard whitespace tokens tagged 'WS*'
        tokens = [token for token in tokens if not token.startswith('WS')]
        pos = pos_tag(tokens, corpus='orchid_ud')
        sentences.append({
            "words": tokens,
            "lemmas": tokens,
            "POS": [_pos[1] for _pos in pos]
        })
        doc = Document.from_sentences(
            sentences, input_file=kwargs.get('input_file', None), **kwargs)
        return doc
    else:
        if spacy_model is not None:
            spacy_model = fix_spacy_for_french(spacy_model)
            spacy_doc = spacy_model(text)
        else:
            max_length = kwargs.get('max_length', 10**6)
            nlp = spacy.load(self.language,
                             max_length=max_length,
                             disable=['ner', 'textcat', 'parser'])
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
            nlp = fix_spacy_for_french(nlp)
            spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                # FIX: fallback if `fix_spacy_for_french` does not work
                "POS": [token.pos_ or token.tag_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })
        doc = Document.from_sentences(
            sentences, input_file=kwargs.get('input_file', None), **kwargs)
        return doc
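# A minimal sketch of the Thai helpers the branch above relies on. `preprocess`
# is a local helper defined elsewhere; attributing `tokenize`/`pos_tag` to
# pythainlp is an assumption, not something this file confirms (the 'deepcut'
# engine also requires the deepcut package):
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

tokens = word_tokenize('ทดสอบการตัดคำภาษาไทย', engine='deepcut')
print(pos_tag(tokens, corpus='orchid_ud'))  # list of (word, POS) pairs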
def read(self, path, **kwargs):
    """Read a CoreNLP-style XML file and convert it to a Document."""
    sentences = []
    tree = etree.parse(path, self.parser)
    for sentence in tree.iterfind('./document/sentences/sentence'):
        # get the character offsets
        starts = [int(u.text) for u in
                  sentence.iterfind("tokens/token/CharacterOffsetBegin")]
        ends = [int(u.text) for u in
                sentence.iterfind("tokens/token/CharacterOffsetEnd")]
        sentences.append({
            "words": [u.text for u in sentence.iterfind("tokens/token/word")],
            "lemmas": [u.text for u in sentence.iterfind("tokens/token/lemma")],
            "POS": [u.text for u in sentence.iterfind("tokens/token/POS")],
            "char_offsets": [(starts[k], ends[k]) for k in range(len(starts))]
        })
        # keep sentence-level attributes (e.g. the sentence id)
        sentences[-1].update(sentence.attrib)
    doc = Document.from_sentences(sentences, input_file=path, **kwargs)
    return doc
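# A self-contained sketch of the XML shape this reader expects, reconstructed
# from the XPath queries above (CoreNLP-style element names; the enclosing
# <root> element is an assumption):
from io import BytesIO
from lxml import etree

EXAMPLE = b"""<root><document><sentences>
  <sentence id="1">
    <tokens>
      <token>
        <word>Hello</word><lemma>hello</lemma><POS>UH</POS>
        <CharacterOffsetBegin>0</CharacterOffsetBegin>
        <CharacterOffsetEnd>5</CharacterOffsetEnd>
      </token>
    </tokens>
  </sentence>
</sentences></document></root>"""

tree = etree.parse(BytesIO(EXAMPLE))
sent = next(tree.iterfind('./document/sentences/sentence'))
assert [u.text for u in sent.iterfind('tokens/token/word')] == ['Hello']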
def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Args:
        text (str): raw text to pre-process.
        max_length (int): maximum number of characters in a single text for
            spacy, defaults to 1,000,000 characters (1MB).
    """

    max_length = kwargs.get('max_length', 10**6)

    # reuse a cached pipeline for this language if one was registered
    if self.language in RawTextReader.nlps:
        nlp = RawTextReader.nlps[self.language]
    else:
        nlp = spacy.load(self.language, max_length=max_length)
    spacy_doc = nlp(text)

    sentences = []
    for sentence_id, sentence in enumerate(spacy_doc.sents):
        sentences.append({
            "words": [token.text for token in sentence],
            "lemmas": [token.lemma_ for token in sentence],
            "POS": [token.pos_ for token in sentence],
            "char_offsets": [(token.idx, token.idx + len(token.text))
                             for token in sentence]
        })
    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Args:
        text (str): raw text to pre-process.
    """

    nlp = spacy.load(self.language)
    spacy_doc = nlp(text)

    sentences = []
    for sentence_id, sentence in enumerate(spacy_doc.sents):
        sentences.append({
            "words": [token.text for token in sentence],
            "lemmas": [token.lemma_ for token in sentence],
            "POS": [token.pos_ for token in sentence],
            "char_offsets": [(token.idx, token.idx + len(token.text))
                             for token in sentence]
        })
    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Spacy model selection: by default this function loads the spacy model
    closest to the `language` parameter (the 'fr' language will load the
    spacy model linked to 'fr' or any available 'fr_core_web_*' model). To
    select the model that will be used, either provide a preloaded model
    via the `spacy_model` parameter, or link the model you wish to use to
    the corresponding language code:
    `python3 -m spacy link spacy_model lang_code`.

    Args:
        text (str): raw text to pre-process.
        max_length (int): maximum number of characters in a single text for
            spacy, defaults to 1,000,000 characters (1MB).
        spacy_model (model): an already loaded spacy model.
    """

    spacy_model = kwargs.get('spacy_model', None)

    if spacy_model is None:
        max_length = kwargs.get('max_length', 10**6)
        try:
            spacy_model = spacy.load(str2spacy(self.language),
                                     max_length=max_length,
                                     disable=['ner', 'textcat', 'parser'])
        except OSError:
            logging.warning('No spacy model for \'{}\' language.'.format(
                self.language))
            logging.warning('Falling back to the english model. There might '
                            'be tokenization and postagging errors. A list of '
                            'available spacy models is available at '
                            'https://spacy.io/models.')
            spacy_model = spacy.load(str2spacy('en_core_web_sm'),
                                     max_length=max_length,
                                     disable=['ner', 'textcat', 'parser'])
        spacy_model.add_pipe(spacy_model.create_pipe('sentencizer'))

    spacy_model = fix_spacy_for_french(spacy_model)
    spacy_doc = spacy_model(text)

    sentences = []
    for sentence_id, sentence in enumerate(spacy_doc.sents):
        sentences.append({
            "words": [token.text for token in sentence],
            "lemmas": [token.lemma_ for token in sentence],
            # FIX: fallback if `fix_spacy_for_french` does not work
            "POS": [token.pos_ or token.tag_ for token in sentence],
            "char_offsets": [(token.idx, token.idx + len(token.text))
                             for token in sentence]
        })
    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
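# Usage sketch for the reader above. Preloading the spacy model once and
# passing it via `spacy_model` avoids repeated spacy.load calls across
# documents; the `RawTextReader` name and constructor are assumed from
# context, so those lines are left commented:
import spacy

nlp = spacy.load('en_core_web_sm')
# reader = RawTextReader(language='en')                  # assumed constructor
# doc = reader.read('Some raw text.', spacy_model=nlp)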
def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Args:
        text (str): raw text to pre-process.
        max_length (int): maximum number of characters in a single text for
            spacy, defaults to 1,000,000 characters (1MB).
    """

    if self.language != 'id':
        max_length = kwargs.get('max_length', 10**6)
        nlp = spacy.load(self.language, max_length=max_length)
        spacy_doc = nlp(text)
        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })
    else:
        # Indonesian: sentence-split and tokenize with module-level helpers,
        # tag with the CRF tagger `ct`, and use stems in place of lemmas.
        text = text.lower()
        token_words = [tokenizer_words.tokenize(t) for t in sent_tokenize(text)]
        token_lemmas = []
        token_pos = ct.tag_sents(token_words)
        for token in token_words:
            temp = []
            for word in token:
                temp.append(stemmer.stem(word))
            token_lemmas.append(temp)
        sentences = []
        for idx, _ in enumerate(token_words):
            sentences.append({
                "words": token_words[idx],
                "lemmas": token_lemmas[idx],
                "POS": token_pos[idx],
            })

    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
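# One plausible wiring for the module-level Indonesian helpers used above
# (`sent_tokenize`, `tokenizer_words`, `ct`, `stemmer`). This is purely an
# assumption; in particular the CRF model file name is hypothetical. Note
# that nltk's CRFTagger.tag_sents returns (word, tag) pairs per sentence:
from nltk.tokenize import TweetTokenizer, sent_tokenize
from nltk.tag import CRFTagger
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

tokenizer_words = TweetTokenizer()
ct = CRFTagger()
ct.set_model_file('indonesian_pos.crf.tagger')  # hypothetical model path
stemmer = StemmerFactory().create_stemmer()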
def read(self, sdoc, **kwargs):
    """Convert an already-processed spacy Doc into a Document."""
    sentences = []
    for sentence_id, sentence in enumerate(sdoc.sents):
        sentences.append({
            "words": [token.text for token in sentence],
            "lemmas": [token.lemma_ for token in sentence],
            "POS": [token.pos_ for token in sentence],
            "char_offsets": [(token.idx, token.idx + len(token.text))
                             for token in sentence]
        })
    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
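# Usage sketch: this reader consumes an already-processed spacy Doc, so the
# caller fully controls the pipeline (model name below is illustrative; the
# `reader` instance is assumed, hence commented):
import spacy

nlp = spacy.load('en_core_web_sm')
sdoc = nlp('One sentence here. Another one there.')
# doc = reader.read(sdoc)
for sent in sdoc.sents:
    print([t.text for t in sent])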
def read(self, text, **kwargs):
    """Parse a JSON string of pre-tokenized, pre-tagged sentences."""
    obj = json.loads(text)
    sentences = []
    for sentence_id, s in enumerate(obj['sents']):
        sentences.append({
            "words": [u['t'] for u in s['tok']],
            "lemmas": [u.get('l', '') for u in s['tok']],
            "POS": [u['p'] for u in s['tok']],
            "char_offsets": [(u['o'], u['o'] + len(u['t']))
                             for u in s['tok']]
        })
    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
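# The JSON schema this reader consumes, reconstructed from the key accesses
# above: 'sents' is a list of sentences, each holding a 'tok' list of tokens
# with text 't', optional lemma 'l', POS tag 'p' and character offset 'o':
import json

payload = json.dumps({
    "sents": [
        {"tok": [
            {"t": "Hello", "l": "hello", "p": "UH", "o": 0},
            {"t": "world", "p": "NN", "o": 6},  # missing 'l' falls back to ''
        ]}
    ]
})
# reader.read(payload) would yield one sentence with words ['Hello', 'world']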
def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Args:
        text (str): raw text to pre-process.
        max_length (int): maximum number of characters in a single text for
            spacy, defaults to 1,000,000 characters (1MB).
        spacy_model (model): an already loaded spacy model.
    """

    spacy_model = kwargs.get('spacy_model', None)

    if spacy_model is not None:
        spacy_model = fix_spacy_for_french(spacy_model)
        spacy_doc = spacy_model(text)
    else:
        max_length = kwargs.get('max_length', 10**6)
        nlp = spacy.load("en_core_web_sm",
                         max_length=max_length,
                         disable=['ner', 'textcat', 'parser'])
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        nlp = fix_spacy_for_french(nlp)
        spacy_doc = nlp(text)

    sentences = []
    for sentence_id, sentence in enumerate(spacy_doc.sents):
        sentences.append({
            "words": [token.text for token in sentence],
            "lemmas": [token.lemma_ for token in sentence],
            # FIX: fallback if `fix_spacy_for_french` does not work
            "POS": [token.pos_ or token.tag_ for token in sentence],
            "char_offsets": [(token.idx, token.idx + len(token.text))
                             for token in sentence]
        })
    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
def read(self, text, **kwargs):
    """Read text annotated with <phrase>...</phrase> markup and feed the
    pre-tokenized sentences to spacy for pre-processing.

    Args:
        text (str): raw text to pre-process, one sentence per line, with
            multi-word phrases wrapped in <phrase> tags.
    """

    sentenceList = []
    for line in StringIO(text):
        line = line.strip()
        tmp = line.split('<phrase>')
        entityMentions = []
        for seg in tmp:
            temp2 = seg.split('</phrase>')
            if len(temp2) > 1:
                # the part before </phrase> is a multi-word mention
                entityMentions.append(' '.join(temp2[0].split(' ')))
                if temp2[1] != '':
                    other_parts = temp2[1].split(' ')
                    while '' in other_parts:
                        other_parts.remove('')
                    entityMentions += other_parts
            elif temp2[0] != ' ' and temp2[0] != '':
                # plain tokens outside any phrase (this branch also covers
                # lines with no phrase at all)
                other_parts = temp2[0].split(' ')
                while '' in other_parts:
                    other_parts.remove('')
                entityMentions += other_parts
        sentenceList.append(entityMentions)

    nlp = spacy.load('en')
    # legacy spacy hack: make nlp() accept pre-tokenized lists of strings
    nlp.tokenizer = nlp.tokenizer.tokens_from_list

    sentences = []
    for spacy_doc in nlp.pipe(sentenceList):
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                # FIX: fallback if `fix_spacy_for_french` does not work
                "POS": [token.pos_ or token.tag_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)
    return doc
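# The line-oriented markup the reader above parses, inferred from its string
# splitting: one sentence per line, plain tokens separated by spaces, and
# multi-word phrases wrapped in <phrase>...</phrase> (sample is illustrative):
example = ("the <phrase>united states</phrase> is a country\n"
           "a sentence without any phrase\n")
# After the parsing loop, sentenceList would hold:
#   [['the', 'united states', 'is', 'a', 'country'],
#    ['a', 'sentence', 'without', 'any', 'phrase']]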