Example #1
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, default to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        spacy_model = kwargs.get('spacy_model', None)
        if self.language == 'th':
            # no spacy model for Thai: tokenize and POS-tag the whole text as a
            # single sentence using the external Thai helpers imported elsewhere
            sentences = []
            text = preprocess(text)
            tokens = tokenize(text, engine='deepcut', remove_whitespace=True)
            # drop whitespace tokens produced by the tokenizer
            tokens = [token for token in tokens if not token.startswith('WS')]
            pos = pos_tag(tokens, corpus='orchid_ud')
            sentences.append({
                "words": tokens,
                "lemmas": tokens,
                "POS": [_pos[1] for _pos in pos]
            })
            doc = Document.from_sentences(sentences,
                                        input_file=kwargs.get('input_file', None),
                                        **kwargs)
            return doc
        else: 
            if spacy_model is not None:
                spacy_model = fix_spacy_for_french(spacy_model)
                spacy_doc = spacy_model(text)
            else:
                max_length = kwargs.get('max_length', 10**6)
                nlp = spacy.load(self.language,
                                max_length=max_length,
                                disable=['ner', 'textcat', 'parser'])
                nlp.add_pipe(nlp.create_pipe('sentencizer'))
                nlp = fix_spacy_for_french(nlp)
                spacy_doc = nlp(text)

            sentences = []
            for sentence_id, sentence in enumerate(spacy_doc.sents):
                sentences.append({
                    "words": [token.text for token in sentence],
                    "lemmas": [token.lemma_ for token in sentence],
                    # FIX : This is a fallback if `fix_spacy_for_french` does not work
                    "POS": [token.pos_ or token.tag_ for token in sentence],
                    "char_offsets": [(token.idx, token.idx + len(token.text))
                                        for token in sentence]
                })

            doc = Document.from_sentences(sentences,
                                        input_file=kwargs.get('input_file', None),
                                        **kwargs)

            return doc
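A minimal usage sketch for a reader like this, assuming the class is the RawTextReader variant named in the later listings and that the returned Document exposes its parsed sentences (the constructor signature and the `sentences`, `words` and `pos` attributes are assumptions, not taken from the snippet):

import spacy

# preload a pipeline once and pass it in, so read() skips the slow spacy.load()
nlp = spacy.load('en_core_web_sm')
reader = RawTextReader(language='en')            # assumed constructor signature
doc = reader.read(text='BERT is a language model.', spacy_model=nlp)

for sentence in doc.sentences:                   # assumed Document attributes
    print(sentence.words, sentence.pos)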
Example #2
    def read(self, path, **kwargs):
        """Read a CoreNLP-style XML file and convert it into a Document."""
        sentences = []
        tree = etree.parse(path, self.parser)
        # iterate over the sentences of the parsed document
        for sentence in tree.iterfind('./document/sentences/sentence'):
            # get the character offsets
            starts = [
                int(u.text)
                for u in sentence.iterfind("tokens/token/CharacterOffsetBegin")
            ]
            ends = [
                int(u.text)
                for u in sentence.iterfind("tokens/token/CharacterOffsetEnd")
            ]
            sentences.append({
                "words":
                [u.text for u in sentence.iterfind("tokens/token/word")],
                "lemmas":
                [u.text for u in sentence.iterfind("tokens/token/lemma")],
                "POS": [u.text for u in sentence.iterfind("tokens/token/POS")],
                "char_offsets": [(starts[k], ends[k])
                                 for k in range(len(starts))]
            })
            sentences[-1].update(sentence.attrib)

        doc = Document.from_sentences(sentences, input_file=path, **kwargs)

        return doc
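The XPath expressions above imply a CoreNLP-style layout. Below is a hand-written stand-in for such a file together with the same iteration, using the standard library's ElementTree (the element paths come from the snippet, the content is made up):

import xml.etree.ElementTree as ET

xml = """<root>
  <document>
    <sentences>
      <sentence id="1">
        <tokens>
          <token id="1">
            <word>Hello</word><lemma>hello</lemma><POS>UH</POS>
            <CharacterOffsetBegin>0</CharacterOffsetBegin>
            <CharacterOffsetEnd>5</CharacterOffsetEnd>
          </token>
        </tokens>
      </sentence>
    </sentences>
  </document>
</root>"""

root = ET.fromstring(xml)
for sentence in root.iterfind('./document/sentences/sentence'):
    print([u.text for u in sentence.iterfind('tokens/token/word')])  # ['Hello']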
Example #3
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, default to 1,000,000 characters (1mb).
        """

        max_length = kwargs.get('max_length', 10**6)

        # reuse a pipeline from the class-level cache when one has already
        # been loaded for this language, otherwise load it from scratch
        if self.language in RawTextReader.nlps:
            nlp = RawTextReader.nlps[self.language]
        else:
            nlp = spacy.load(self.language,
                             max_length=max_length)

        spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
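This variant reads from a class-level `nlps` cache; one plausible way to use it is to prime the cache so `read()` reuses the pipeline instead of reloading it (constructor signature assumed):

import spacy

# prime the cache once, then every read() call for 'en' reuses this pipeline
RawTextReader.nlps['en'] = spacy.load('en_core_web_sm')
doc = RawTextReader(language='en').read(text='A short test sentence.')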
Example #4
File: readers.py Project: jordanlmx/pke
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
        """

        nlp = spacy.load(self.language)
        spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
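Note that `spacy.load(self.language)` with a bare code such as 'en' only resolves when a model has been linked to that shortcut (a spacy 2.x feature; shortcut links were removed in spacy 3.x), e.g.:

# link an installed model to the bare language code so spacy.load('en') works,
# assuming en_core_web_sm has already been downloaded (spacy 2.x only):
#   python3 -m spacy link en_core_web_sm en
import spacy

nlp = spacy.load('en')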
Example #5
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Spacy model selection: By default this function will load the spacy
        model that is closest to the `language` parameter ('fr' language will
        load the spacy model linked to 'fr' or any 'fr_core_web_*' available
        model). In order to select the model that will be used please provide a
        preloaded model via the `spacy_model` parameter, or link the model you
        wish to use to the corresponding language code
        `python3 -m spacy link spacy_model lang_code`.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, default to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        spacy_model = kwargs.get('spacy_model', None)

        if spacy_model is None:
            max_length = kwargs.get('max_length', 10**6)
            try:
                spacy_model = spacy.load(str2spacy(self.language),
                                         max_length=max_length,
                                         disable=['ner', 'textcat', 'parser'])
            except OSError:
                logging.warning('No spacy model for \'{}\' language.'.format(
                    self.language))
                logging.warning(
                    'Falling back to using the english model. There might '
                    'be tokenization and postagging errors. A list of available '
                    'spacy models is available at https://spacy.io/models.')
                spacy_model = spacy.load(str2spacy('en_core_web_sm'),
                                         max_length=max_length,
                                         disable=['ner', 'textcat', 'parser'])
            spacy_model.add_pipe(spacy_model.create_pipe('sentencizer'))

        spacy_model = fix_spacy_for_french(spacy_model)
        spacy_doc = spacy_model(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                # FIX : This is a fallback if `fix_spacy_for_french` does not work
                "POS": [token.pos_ or token.tag_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)

        return doc
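The `str2spacy` helper is defined elsewhere in readers.py; a rough, hypothetical sketch of the mapping it performs (installed-model lookup with a pass-through fallback) could look like this:

import importlib.util

def str2spacy(language):
    """Hypothetical sketch: map a code such as 'en' to an installed model name."""
    candidates = ['{}_core_web_sm'.format(language),
                  '{}_core_news_sm'.format(language)]
    for name in candidates:
        if importlib.util.find_spec(name) is not None:
            return name
    # no packaged model found: return the code unchanged and let spacy.load()
    # resolve a linked shortcut or raise OSError (caught by the caller above)
    return language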
Example #6
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, default to 1,000,000 characters (1mb).
        """
        if self.language != 'id':
            max_length = kwargs.get('max_length', 10**6)
            nlp = spacy.load(self.language, max_length=max_length)
            spacy_doc = nlp(text)
            sentences = []
            for sentence_id, sentence in enumerate(spacy_doc.sents):
                sentences.append({
                    "words": [token.text for token in sentence],
                    "lemmas": [token.lemma_ for token in sentence],
                    "POS": [token.pos_ for token in sentence],
                    "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
                })

        else:
            # spacy has no Indonesian model: fall back to the module-level
            # sentence/word tokenizers, POS tagger and stemmer defined elsewhere
            text = text.lower()
            token_words = [
                tokenizer_words.tokenize(t) for t in sent_tokenize(text)
            ]
            token_lemmas = []
            token_pos = ct.tag_sents(token_words)

            # use stems as a stand-in for lemmas
            for token in token_words:
                token_lemmas.append([stemmer.stem(word) for word in token])

            sentences = []
            for idx, _ in enumerate(token_words):
                sentences.append({
                    "words": token_words[idx],
                    "lemmas": token_lemmas[idx],
                    "POS": token_pos[idx],
                })
        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)

        return doc
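The Indonesian branch relies on module-level objects (`tokenizer_words`, `ct`, `stemmer`) defined elsewhere in this fork. A hypothetical setup consistent with the calls made above, using NLTK and Sastrawi, might be:

from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.tag import CRFTagger
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

tokenizer_words = RegexpTokenizer(r'\w+')     # word tokenizer used per sentence
ct = CRFTagger()                              # CRF POS tagger
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')  # assumed model file
stemmer = StemmerFactory().create_stemmer()   # Indonesian stemmer used as lemmatizer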
Example #7
File: readers.py Project: tc64/pke
    def read(self, sdoc, **kwargs):
        """Convert an already processed spacy Doc into a Document."""
        sentences = []
        for sentence_id, sentence in enumerate(sdoc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)
        return doc
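This variant takes text that has already been run through spacy; a minimal calling sketch (the reader class is unnamed in the snippet, so the name below is hypothetical):

import spacy

nlp = spacy.load('en_core_web_sm')
sdoc = nlp('One sentence here. And another one.')

reader = SpacyDocReader()   # hypothetical name for the class defined above
doc = reader.read(sdoc)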
Example #8
    def read(self, text, **kwargs):
        """Read a pre-processed document serialized as JSON."""
        obj = json.loads(text)

        sentences = []
        # each token carries its text 't', lemma 'l', POS tag 'p' and offset 'o'
        for sentence_id, s in enumerate(obj['sents']):
            sentences.append({
                "words": [u['t'] for u in s['tok']],
                "lemmas": [u.get('l', '') for u in s['tok']],
                "POS": [u['p'] for u in s['tok']],
                "char_offsets": [(u['o'], u['o'] + len(u['t'])) for u in s['tok']]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
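From the keys accessed above, the JSON this reader expects looks roughly like the following (field names taken from the snippet, example values made up):

import json

payload = {
    "sents": [
        {"tok": [
            {"t": "Cats",  "l": "cat",   "p": "NOUN",  "o": 0},
            {"t": "sleep", "l": "sleep", "p": "VERB",  "o": 5},
            {"t": ".",     "l": ".",     "p": "PUNCT", "o": 10}
        ]}
    ]
}
text = json.dumps(payload)
# doc = reader.read(text)   # reader: an instance of the (unnamed) class above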
Example #9
File: readers.py Project: praetr/pke
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, default to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        spacy_model = kwargs.get('spacy_model', None)

        if spacy_model is not None:
            spacy_model = fix_spacy_for_french(spacy_model)
            spacy_doc = spacy_model(text)
        else:
            max_length = kwargs.get('max_length', 10**6)
            nlp = spacy.load("en_core_web_sm",
                             max_length=max_length,
                             disable=['ner', 'textcat', 'parser'])
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
            nlp = fix_spacy_for_french(nlp)
            spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                # FIX : This is a fallback if `fix_spacy_for_french` does not work
                "POS": [token.pos_ or token.tag_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)

        return doc
Example #10
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Spacy model selection: By default this function will load the spacy
        model that is closest to the `language` parameter ('fr' language will
        load the spacy model linked to 'fr' or any 'fr_core_web_*' available
        model). In order to select the model that will be used please provide a
        preloaded model via the `spacy_model` parameter, or link the model you
        wish to use to the corresponding language code
        `python3 -m spacy link spacy_model lang_code`.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, default to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        # split each line into tokens, keeping <phrase>...</phrase> spans as
        # single mentions and dropping empty strings produced by split()
        sentenceList = []
        for line in StringIO(text):
            line = line.strip()
            tmp = line.split('<phrase>')
            entityMentions = []
            if len(tmp) <= 2:
                # at most one <phrase> on this line
                entityMentions += [w for w in tmp[0].split(' ') if w]
            for seg in tmp:
                temp2 = seg.split('</phrase>')
                if len(temp2) > 1:
                    entityMentions.append(temp2[0])
                    entityMentions += [w for w in temp2[1].split(' ') if w]
                elif temp2[0] not in (' ', ''):
                    entityMentions += [w for w in temp2[0].split(' ') if w]
            sentenceList.append(entityMentions)

        # the input is already tokenized, so replace spacy's tokenizer with the
        # (legacy) tokens_from_list hook and feed the token lists through the pipe
        nlp = spacy.load('en')
        nlp.tokenizer = nlp.tokenizer.tokens_from_list
        # collect the sentences from every processed token list into a single Document
        sentences = []
        for spacy_doc in nlp.pipe(sentenceList):
            for sentence_id, sentence in enumerate(spacy_doc.sents):
                sentences.append({
                    "words": [token.text for token in sentence],
                    "lemmas": [token.lemma_ for token in sentence],
                    # fall back to the fine-grained tag if the coarse POS is empty
                    "POS": [token.pos_ or token.tag_ for token in sentence],
                    "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
                })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
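For illustration, one line of the `<phrase>`-annotated input and the token list the splitting loop above produces for it (phrases are kept as single mentions):

line = 'studies on <phrase>keyphrase extraction</phrase> with <phrase>graph ranking</phrase>'
# -> ['studies', 'on', 'keyphrase extraction', 'with', 'graph ranking']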