Example #1
    def tokenize(self, text):
        r"""
        Tokenizing a sentence inserts spaces so that punctuation is separated
        from words and contractions are split apart, producing the kind of
        input that many natural language tools (especially parsers) expect.

            >>> en_nl.tokenize("Time is an illusion. Lunchtime, doubly so.")
            u'Time is an illusion . Lunchtime , doubly so .'
            >>> untok = '''
            ... "Very deep," said Arthur, "you should send that in to the
            ... Reader's Digest. They've got a page for people like you."
            ... '''
            >>> tok = en_nl.tokenize(untok)
            >>> tok
            u"`` Very deep , '' said Arthur , `` you should send that in to the Reader 's Digest . They 've got a page for people like you . ''"
            >>> en_nl.untokenize(tok)
            u'"Very deep," said Arthur, "you should send that in to the Reader\'s Digest. They\'ve got a page for people like you."'
            >>> en_nl.untokenize(tok) == untok.replace('\n', ' ').strip()
            True

        """
        # Normalize the text, drop carriage returns, and turn newlines into spaces.
        step0 = preprocess_text(text).replace('\r', '').replace('\n', ' ')
        # Treat an apostrophe after a space as an opening quote (`), split
        # contractions at the apostrophe ("Reader's" -> "Reader 's"), keep
        # "n't" together as its own token ("don't" -> "do n't"), and expand
        # "cannot" into "can not".
        cur = step0.replace(" '", " ` ").replace("'", " '").replace(
            "n 't", " n't").replace("cannot", "can not")
        for regex, replacement in compiled_tokenizer_regexes:
            cur = regex.sub(replacement, cur)
        return cur.strip()
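Most of the heavy lifting above happens in compiled_tokenizer_regexes, which is defined elsewhere in the module and not shown here. A minimal sketch of what such a table could look like, with purely illustrative patterns (these are guesses, not simplenlp's actual rules):

import re

# Hypothetical (pattern, replacement) pairs in the spirit of Penn
# Treebank-style tokenization; simplenlp's real list is more complete.
compiled_tokenizer_regexes = [
    (re.compile(r'"([^"]*)"'), r"`` \1 ''"),  # straight quotes -> `` ... ''
    (re.compile(r'([.,!?;:])'), r' \1 '),     # detach sentence punctuation
    (re.compile(r'\s+'), ' '),                # collapse runs of whitespace
]

Applied in order, rules like these turn "Lunchtime, doubly so." into "Lunchtime , doubly so .", consistent with the first doctest above.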
Example #2
    def canonicalize(self, word):
        """
        Reduce equivalent characters to a canonical form.

        In a EuroNL, by default, this puts those characters in lowercase.
        """
        return preprocess_text(word).lower()
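A quick usage sketch, assuming en_nl is an English EuroNL instance and that preprocess_text passes plain ASCII text through unchanged (neither is shown above):

    >>> en_nl.canonicalize('Lunchtime')
    u'lunchtime'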
Example #3
    def analyze(self, text):
        """
        Runs a line of text through MeCab, and returns the results as a
        list of lists ("records") that contain the MeCab analysis of each
        word.
        """
        text = preprocess_text(text).lower()
        # Feed the text to the MeCab process in chunks of at most 1024
        # characters, one line at a time.
        n_chunks = (len(text) + 1024) // 1024
        results = []
        for chunk in xrange(n_chunks):
            chunk_text = text[chunk * 1024:(chunk + 1) * 1024].encode(
                self.mecab_encoding)
            self.mecab.stdin.write(chunk_text + '\n')
            out_line = ''
            # Read one analysis record per line until MeCab marks the end of
            # the input with an "EOS" line.
            while True:
                out_line = self.mecab.stdout.readline()
                out_line = out_line.decode(self.mecab_encoding)

                if out_line == u'EOS\n':
                    break

                # Each record is the surface form followed by MeCab's
                # comma-separated feature fields.
                word, info = out_line.strip(u'\n').split(u'\t')
                record = [word] + info.split(u',')

                # special case for detecting nai -> n
                if record[0] == u'ん' and record[5] == u'不変化型':
                    record[7] = record[1] = u'ない'

                results.append(record)
        return results
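The snippet relies on self.mecab being a long-running MeCab process with its standard input and output connected to pipes. A rough sketch of how that handle might be created (the class name and constructor below are assumptions for illustration; they are not shown in the example):

import subprocess

class JapaneseNLSketch(object):
    """Hypothetical wrapper that keeps one MeCab process open for analyze()."""

    def __init__(self, mecab_encoding='utf-8'):
        self.mecab_encoding = mecab_encoding
        # Assumes the `mecab` binary is on PATH; analyze() writes lines to
        # stdin and reads tab/comma-separated records back from stdout.
        self.mecab = subprocess.Popen(
            ['mecab'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )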
Example #4
import codecs
import csv
import re
import unicodedata

import numpy as np
import simplenlp


def read_csv_columns(csv_file, columns, header=True):
    reader = csv.reader(codecs.open(csv_file, 'r', 'latin-1'), delimiter=',')
    if header:
        next(reader)  # skip the header row
    data = []
    for row in reader:
        fields = []
        for c in columns:
            # Strip surrounding double quotes and interpret backslash escapes.
            txt = re.sub(r'^"|"$', '', row[c]).decode('unicode-escape')
            # Fold accented characters to their ASCII equivalents before
            # preprocessing.
            txt = unicodedata.normalize('NFKD', txt).encode(
                'ascii', 'ignore').decode('ascii')
            fields.append(simplenlp.preprocess_text(txt))
        data.append(fields)
    return np.array(data)
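A brief usage sketch, assuming a hypothetical file reviews.csv whose second and third columns hold free text (the filename and column indices are made up for illustration):

rows = read_csv_columns('reviews.csv', columns=[1, 2])
print rows.shape  # (number of data rows, 2); each cell is preprocessed text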
Example #5
def test_preprocess():
    assert simplenlp.preprocess_text(
        'This is a\000 test.\r\n') == 'This is a test. '
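Judging from this test and the call sites in the earlier examples, preprocess_text strips control characters and normalizes line endings (the real function in simplenlp very likely does more, such as Unicode normalization). A rough behavioral sketch that satisfies the assertion above, offered only as an illustration:

def preprocess_text_sketch(text):
    # Turn line endings into spaces, then drop remaining control characters
    # such as NUL; this is enough to reproduce the assertion above.
    text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
    return ''.join(ch for ch in text if ord(ch) >= 32)

assert preprocess_text_sketch('This is a\000 test.\r\n') == 'This is a test. '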