Example #1
    def __index_content(url_id, db, soup):
        # Keep title and content as unicode text; the Punkt tokenizer below
        # expects text, not UTF-8 bytes.  Guard against pages without <title>.
        title = soup.title.text if soup.title is not None else u""
        if not isinstance(title, unicode):
            title = unicode(title)

        content = soup.find("div", {"id": "mw-content-text"}).text
        if not isinstance(content, unicode):
            content = unicode(content)

        # content = soup.text

        custom_tokenizer = PunktSentenceTokenizer()
        tokenized_sentences = custom_tokenizer.tokenize(unicode(content))

        page = dict()
        page["title"] = title
        hints_list = list()
        page["hints"] = hints_list  # ensure the key exists even if chunking fails below
        try:
            for sentence in tokenized_sentences:
                words = nltk.word_tokenize(sentence)
                tagged = nltk.pos_tag(words)

                grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
                {<NNP>+}"""
                chunk_parser = nltk.RegexpParser(grammar)
                chunked = chunk_parser.parse(tagged)
                for chunk in chunked.subtrees():
                    if chunk.label() == "NP":
                        line = list()
                        for each in chunk.leaves():
                            if len(each[0]) > 2:
                                line.append(each[0])
                        if len(line) > 0:
                            final_value = (" ".join(line)).lower()
                            hints_list.append(final_value)
                page["hints"] = hints_list

        except Exception as e:
            print(str(e))
        db.known_urls.update_one({"_id": url_id}, {"$set": {"content": page}})

        page_content_size = len(page["hints"])
        print(colored("\t\tUpdated With Indexed Content", "yellow"))

        # current_dir = os.getcwd()
        # files_dir = current_dir + "/Originals/"
        # file_name = url_id
        # file_path = files_dir + str(file_name)
        # created_file = open(file_path, "w")
        # created_file.write(content.encode("utf-8"))
        # created_file.close()
        # print("\t\tOriginal Content Is Saved")
        return page_content_size
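For reference, a standalone sketch of the noun-phrase chunking step used above; the sample sentence is illustrative and the NLTK 'punkt' and 'averaged_perceptron_tagger' data packages must be downloaded.

import nltk

sentence = "The quick brown fox jumps over the lazy dog"
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
{<NNP>+}"""
chunked = nltk.RegexpParser(grammar).parse(tagged)
for chunk in chunked.subtrees():
    if chunk.label() == "NP":
        # e.g. "the quick brown fox", "the lazy dog"
        print(" ".join(word for word, _ in chunk.leaves()).lower())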
Example #2
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(
                        current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
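A quick, hypothetical check of the subtitle splitter above; the dialogue string is made up.

dialogue = (
    "Hello there, how are you doing today?\n\n"
    "I am doing fine, thanks for asking. The weather has been lovely."
)
for subtitle in tokenize_english_document(dialogue):
    # each subtitle is a list of at most two lines of <= 38 characters
    print(subtitle)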
Example #3
def pre_segment(doc):
    """Set sentence boundaries with nltk instead of spacy."""
    if len(str(doc.text).split()) > 3:
        tokenizer = PunktSentenceTokenizer(doc.text)
        sentences = tokenizer.tokenize(doc.text)
        for nltk_sentence in sentences:
            words = re.findall(r"[\w]+|[^\s\w]", nltk_sentence)
            for i in range(len(doc) - len(words) + 1):
                token_list = [str(token) for token in doc[i:i + len(words)]]
                if token_list == words:
                    doc[i].is_sent_start = True
                    for token in doc[i + 1:i + len(words)]:
                        token.is_sent_start = False
    return doc
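A hedged sketch of wiring pre_segment into a pipeline via spaCy 2.x's function-component API (spaCy 3.x would first require registering it with @Language.component); the model name is an assumption.

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
nlp.add_pipe(pre_segment, name="nltk_sentencizer", before="parser")
doc = nlp("Dr. Smith went to Washington. He arrived on Monday.")
print([sent.text for sent in doc.sents])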
Example #4
def get_nltk_sents(txt: str,
                   tokenizer: nltk.PunktSentenceTokenizer,
                   extra_abbreviations: Set[str] = None) -> List[str]:
    if extra_abbreviations is not None:
        tokenizer._params.abbrev_types.update(extra_abbreviations)

    return tokenizer.tokenize(txt)
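A minimal usage sketch for get_nltk_sents; the text and the extra abbreviation "al" (so "et al." does not end a sentence) are illustrative.

from nltk.tokenize import PunktSentenceTokenizer

text = "See Smith et al. 2019 for details. The method works well."
sents = get_nltk_sents(text, PunktSentenceTokenizer(), extra_abbreviations={"al"})
print(sents)  # expect two sentences, with "et al. 2019" kept intact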
Example #5
def sentence_tokenizer(text):
    """
    Tokenizes sentences.

    :param text:
    :return: list of sentences (a sentence is a string)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv',
        'bzw', 'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort', 'co',
        'kg', 'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b', 'evtl',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
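A small, made-up check: "ca.", "Mio." and "Dr." are covered by the abbreviation list above, so their periods should not split the sentence.

text = "Das Projekt kostet ca. 2 Mio. Euro, sagte Dr. Meier. Es beginnt im Mai."
for s in sentence_tokenizer(text):
    print(s)
# expected: two sentences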
Example #6
class LanguageModel:
    """
    N-gram model
    """
    def __init__(self, n_gram=2, missed_value=0.99):
        """

        :param n_gram: length of n-gram
        :param missed_value: default value for all unseen n-gram
        """
        self.n = n_gram
        self.n_grams = {}
        self.context = {}
        self.sentence_tokenizer = SentenceTokenizer()
        self.tokenizer = Tokenizer()
        self.missed_value = missed_value

    def build_model(self, text):
        sentences = self.sentence_tokenizer.tokenize(text)
        words = [
            list(
                filter(
                    lambda s: s.isalpha(),
                    self.tokenizer.tokenize(sentence.strip())
                )
            ) for sentence in sentences
        ]
        for sentence in words:
            if len(sentence) < self.n:
                key = " ".join(sentence)
                self.context.update({key: self.context.get(key, 0) + 1})
            else:
                for i in range(len(sentence) - self.n + 1):
                    context_key = " ".join(sentence[i:i + self.n - 1])
                    n_gram_key = " ".join(sentence[i:i + self.n])
                    self.context.update({context_key: self.context.get(context_key, 0) + 1})
                    self.n_grams.update({n_gram_key: self.n_grams.get(n_gram_key, 0) + 1})

    def calculate_proba(self, sentence):
        words = list(
            filter(
                lambda s: s.isalpha(),
                self.tokenizer.tokenize(sentence.strip())
            )
        )
        result = 1
        for i in range(min(self.n - 2, len(words) - 1), len(words)):
            if i < self.n - 1:
                size = sum([val for key, val in self.context.items() if len(key.split(" ")) == i+1])
                result *= self.context.get(" ".join(words[:i+1]), self.missed_value if i == self.n - 2 else 0) / size
            elif i > self.n - 2:
                context_key = " ".join(words[i-self.n+1:i])
                n_gram_key = " ".join(words[i-self.n+1:i+1])
                context_val = self.context.get(context_key, self.missed_value)
                n_gram_val = self.n_grams.get(n_gram_key, self.missed_value)
                p = n_gram_val / context_val
                result *= p
        return result
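A rough usage sketch; SentenceTokenizer and Tokenizer are whatever classes the original module imports (NLTK's PunktSentenceTokenizer and TreebankWordTokenizer would fit the interface used above).

lm = LanguageModel(n_gram=2)
lm.build_model("the cat sat on the mat. the cat slept.")
print(lm.calculate_proba("the cat sat"))   # seen word pairs -> higher probability
print(lm.calculate_proba("the dog flew"))  # unseen pairs fall back to missed_value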
Example #7
 def handle(self, *app_labels, **options):
     print app_labels
     print options
     for article in BwogArticle.objects.all():
         sentence_tokenizer = PunktSentenceTokenizer()
         sentences = sentence_tokenizer.tokenize(article.body)
         for sentence_index in range(len(sentences)):
             sentence = sentences[sentence_index]
             sentence_words = nltk.word_tokenize(sentence)
             tagged = nltk.pos_tag(sentence_words)
             for tup_index in range(len(tagged)):
                 tup = tagged[tup_index]
                 article_word = tup[0]
                 article_tag = tup[1]
                 p = ParsedItem(content_object=article, word=article_word, tag=article_tag,
                                sentence_sequence=sentence_index, word_sequence=tup_index)
                 p.save()
                 print p
Example #8
class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of NLTK sentence tokenizer.
    """
    def __init__(self):
        super().__init__()
        self.sent_splitter = PunktSentenceTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
            Sentence(input_pack, begin, end)
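For reference, span_tokenize yields character offsets rather than strings, which is what lets the processor above create Sentence annotations in place.

from nltk.tokenize import PunktSentenceTokenizer

text = "First sentence. Second one."
print(list(PunktSentenceTokenizer().span_tokenize(text)))
# e.g. [(0, 15), (16, 27)]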
Example #9
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                     }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # #print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
Example #10
    def __init__(self, n_gram=2, missed_value=0.99):
        """

        :param n_gram: length of n-gram
        :param missed_value: default value for all unseen n-gram
        """
        self.n = n_gram
        self.n_grams = {}
        self.context = {}
        self.sentence_tokenizer = SentenceTokenizer()
        self.tokenizer = Tokenizer()
        self.missed_value = missed_value
Example #11
def sentence_split(input_text):
    input_text = "<root>" + input_text + "</root>"

    soup = BeautifulSoup(input_text, "xml")
    paragraphs = []
    for doc in soup.find('root').findAll('DOC'):
        if doc['type'] == 'story':
            headlines = doc('HEADLINE')
            for h in headlines:
                paragraphs.append(h.contents[0])
            p_blocks = doc.find('TEXT').findAll('P')
            for p in p_blocks:
                paragraphs.append(p.contents[0])
        elif doc['type'] == 'multi':
            paragraphs.append(doc.find('TEXT').contents[0])

    sentences = []
    punkt = PunktSentenceTokenizer()
    for parag in paragraphs:
        for sent in punkt.sentences_from_text(parag, realign_boundaries=True):
            # `replace` is presumably a precompiled whitespace-normalisation
            # regex defined elsewhere in the original module.
            sentences.append(replace.sub(' ', sent).strip())
    return sentences
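A hypothetical call with Gigaword-style markup; `replace` is stood in for here by a simple whitespace regex, and BeautifulSoup's "xml" parser requires lxml.

import re
replace = re.compile(r"\s+")  # stand-in for the module's own regex

sample = """
<DOC id="X1" type="story">
<HEADLINE>An example headline</HEADLINE>
<TEXT><P>First paragraph. It has two sentences.</P></TEXT>
</DOC>
"""
print(sentence_split(sample))
# e.g. ['An example headline', 'First paragraph.', 'It has two sentences.']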
Example #12
def sent_tokenize(text):
    model_path = join(dirname(__file__), 'sent_tokenize_model_v1.0.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…', 'ts',
        'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i', 'j.f', 'r.r',
        'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…', 'ph', 'j.k',
        'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.sentences_from_text(text)
    return sentences
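A quick sanity check of the single-letter abbreviation handling configured above (assuming the pickled model file is present next to the module).

print(sent_tokenize("J. K. Rowling wrote it. I read it twice."))
# expected: ['J. K. Rowling wrote it.', 'I read it twice.']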
Example #13
class SentenceToVec(BaseEstimator, TransformerMixin):

    def __init__(self, stop_words, vector_len=1000):
        self.vocab = []
        self.stop_words = stop_words
        self.vector_len = vector_len

        self.tokenizer = PunktSentenceTokenizer()

    def format_word(self, word):
        if word.isdigit():
            return "0"
        elif word in self.stop_words:
            return ""
        else:
            return word.strip()

    def tokenize(self, sentence):
        res_tokens = []
        tokens_temp = self.tokenizer.tokenize(sentence)
        for tokens in tokens_temp:
            tokens = nltk.word_tokenize(tokens)
            tokens = [self.format_word(t) for t in tokens]
            res_tokens += [t for t in tokens if t]
        return res_tokens

    def fit(self, X, y=None):
        self.vocab = []
        word_freq = Counter()
        for i in range(X.shape[0]):
            for w in self.tokenize(X[i]):
                if w not in self.stop_words:
                    word_freq[w] += 1

        for term, freq in word_freq.most_common():
            if len(self.vocab) < self.vector_len:
                self.vocab.append(term)
        return self

    def _vectorize(self, words):
        freq = dict(Counter(words))
        vector = []
        for v in self.vocab:
            vector.append(freq[v] if v in words else 0)
        return np.array(vector)

    def transform(self, X, copy=True):
        _X = np.zeros((X.shape[0], len(self.vocab)))
        for i in range(X.shape[0]):
            _X[i] = self._vectorize(self.tokenize(X[i]))
        return _X
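A minimal, assumed usage of the transformer above; X is indexed positionally and via .shape, so a 1-D object ndarray of documents fits, and the NLTK stopwords and punkt data must be downloaded.

import numpy as np
from nltk.corpus import stopwords

docs = np.array(["The cat sat on the mat.", "Dogs chase cats."], dtype=object)
vec = SentenceToVec(stop_words=set(stopwords.words("english")), vector_len=50)
features = vec.fit(docs).transform(docs)
print(features.shape)  # (2, vocabulary size capped at 50)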
Example #14
class CoreNLP:
    def __init__(self):
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()

    @staticmethod
    def corenlp_server():
        return getenv('CORENLP_SERVER')

    def dep_parse(self, text: str, conll_version=10) -> str:
        """Get a CoreNLP depparse,lemma"""
        def get_conll(t):
            deps, = self.parser.raw_parse(t)
            return deps.to_conll(conll_version)  # xrenner requires conll10

        sentences = self.sentence_tokenizer.sentences_from_text(text)
        return '\n'.join(map(get_conll, sentences))
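An assumed invocation: it needs a running CoreNLP server reachable via the CORENLP_SERVER environment variable (e.g. http://localhost:9000), with CoreNLPDependencyParser and getenv imported from nltk.parse.corenlp and os in the original module.

corenlp = CoreNLP()
conll = corenlp.dep_parse("The quick brown fox jumps. It lands softly.")
print(conll)  # one CoNLL-10 block per sentence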
Example #15
def _extract_text_from_elements(elements: List[Element], punkt: bool,
                                keep_xml: bool) -> List[str]:
    examples = []
    if keep_xml:
        for e in elements:
            xml_str = tostring(e).decode('utf-8')  # tostring returns bytes
            length = len(innertext(e))
            if length > config.min_char_length:
                examples.append(xml_str)
    else:
        for e in elements:
            text = innertext(e)
            if punkt:
                sentences = PunktSentenceTokenizer().tokenize(text=text)
                filtered_sentences = [s for s in sentences if _filter(s)]
                examples += filtered_sentences
            else:
                if _filter(text):
                    examples.append(text)
    return examples
Example #16
def _load_model():
    global sentence_tokenizer
    if sentence_tokenizer is not None:
        return
    model_path = join(dirname(__file__), 'st_kiss-strunk-2006_2019_01_13.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…', 'ts',
        'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i', 'j.f', 'r.r',
        'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…', 'ph', 'j.k',
        'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    sentence_tokenizer = PunktSentenceTokenizer(punkt_param)
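A sketch of how the lazily-loaded tokenizer would then be used, assuming the module defines `sentence_tokenizer = None` at import time and ships the pickled Punkt parameters file.

_load_model()
print(sentence_tokenizer.tokenize("Dr. Jones met Mr. Smith. They talked for an hour."))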
Example #17
class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of NLTK sentence tokenizer."""
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        nltk.download("punkt")

    def __init__(self):
        super().__init__()
        self.sent_splitter = PunktSentenceTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
            Sentence(input_pack, begin, end)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type record of `NLTKSentenceSegmenter`, which
        is `ft.onto.base_ontology.Sentence`
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the datapack for type record that need to
                fill in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Sentence"] = set()
Example #18
from string import translate, maketrans, punctuation
from nltk import PunktSentenceTokenizer, PorterStemmer
from textblob import TextBlob
from bs4 import BeautifulSoup as BS
import multiprocessing
from textwrap import dedent
from itertools import izip_longest
from itertools import chain, combinations_with_replacement
import urllib2

#for punctuation
pp = punctuation
del (punctuation)
T = maketrans(pp, ' ' * len(pp))
tknr = PunktSentenceTokenizer()

#download training data from this dude's githubrepo
url = "https://raw.githubusercontent.com/rhasan/nltk-try/532e51035b509c10b08bef4666307a37ca5409ec/ngram/simple_wikipedia_plaintext.txt"
req = urllib2.Request(url)
raw = urllib2.urlopen(req).read().split('\n')
raw = list(chain(*[x.strip().lower().split('.') for x in raw if x != '']))
raw = [removeNonAscii(x) for x in raw]  # removeNonAscii() is defined elsewhere in the original script
raw = [x for x in raw if len(x) > 1]

with open('train_sentences.txt', 'w') as f:
    for line in raw:
        f.write(line + '\n')

del (raw)
Example #19
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 12:49:39 2020

@author: alex.a.murray
"""
import nltk 
from nltk.corpus import state_union
from nltk import PunktSentenceTokenizer


train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))


process_content()
Example #20
 def tokenize_to_sentences(self, paragraph):
     tokenizer = PunktSentenceTokenizer()
     sentences = tokenizer.tokenize(paragraph)
     return sentences
Example #21
import sys
import re

from nltk import word_tokenize, pos_tag, PunktSentenceTokenizer, tag

if len(sys.argv) < 2:
    raw = sys.stdin.read()
else:
    f = open(sys.argv[1])
    raw = f.read()

# `lib` is a project-local helper module providing get_dat_sgml()
lines = lib.get_dat_sgml(raw)

sys.stderr.write(str(len(lines)) + " entries\n")

p = PunktSentenceTokenizer()

for i in range(len(lines)):
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    keywords = re.split("\t", line["EKYWD"])
    keywords = [word_tokenize(keyword) for keyword in keywords]
    for sentence in abstract:
        pos_sentence = pos_tag(sentence)
        # nltk.tag.simplify was removed in NLTK 3; this line needs NLTK 2.x
        pos_sentence = [(word, tag.simplify.simplify_wsj_tag(t)) for word, t in pos_sentence]
Example #22
File: gpos.py Project: wenh81/ner
for line in lines:
    line_dict = dict(line)
    if "EKYWD" in line_dict and "EABST" in line_dict:
        keywords = re.split("\t", line_dict["EKYWD"])
        all_keywords.update(set(keywords))

sys.stderr.write("Tokenize keywords\n")

keywords = []

for keyword in all_keywords:
    keywords.append(word_tokenize(keyword))

sys.stderr.write("All keywords: " + str(len(all_keywords)) + "\n")

p = PunktSentenceTokenizer()

for i in range(len(lines)):
    if i % 10 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    for sentence in abstract:
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
Example #23
# Representing the words with their parts of speech
import nltk
from nltk.corpus import state_union
''' PunktSentenceTokenizer is an unsupervised ML sentence tokenizer.
It ships pre-trained, and it can also be trained further on domain text. '''
from nltk import PunktSentenceTokenizer

train = state_union.raw("2005-GWBush.txt")
text = state_union.raw("2006-GWBush.txt")
SentenceTokenizer = PunktSentenceTokenizer(train)

tokenized = SentenceTokenizer.tokenize(text)


def process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process()
Example #24
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(),
                               'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        try:
            # loading dictionary is part of the init process
            self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path))
            self.__jieba.load_userdict(os.path.join(
                self.__jieba_userdict_path))
        except Exception as ex:
            raise McChineseTokenizerException(
                "Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split("\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split("\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words.
        
        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass
        return words
Example #25
def get_sentence_tokenizer():
    # https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    return PunktSentenceTokenizer()
Example #26
 def __init__(self):
     super().__init__()
     self.sent_splitter = PunktSentenceTokenizer()
Example #27
 def __init__(self):
     self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
     self.sentence_tokenizer = PunktSentenceTokenizer()
Example #28
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
import nltk

example = "Hello Mr. Holmes. How are you doing? The weather is nice Holmes and Python is amazing. I hope you like it too!"
sen_list = sent_tokenize(example)
sen = sen_list[2]
print(sen)
stop_words = set(stopwords.words('english'))
'''words = word_tokenize(sen)
filtered_words = []
for w in words:
    if w not in stop_words:
        filtered_words.append(w)
print(filtered_words)
'''
tokenize = PunktSentenceTokenizer(sen)
tokenized = tokenize.tokenize(sen)  # sentence tokenizing; POS tagging follows below
print(tokenized)
for i in tokenized:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)
    # Chunking
    '''
    using regex  here . means select all characters
    ? means atleast 1 repetation.. for further info see tutorial on pythonprogrammong.net
    '''
    chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?} """  # RB,VB,NNp etc are tags like VB=verb... what we are doing here is selecting certain type of words in chunk
    chunkparser = nltk.RegexpParser(chunkgram)
    chunked = chunkparser.parse(tagged)
    print(chunked)
Example #29
# `example_text` and `words` are defined earlier in the original script
sentences = sent_tokenize(example_text)

for w in words:
    print(w)

print()

for s in sentences:
    print(s)

print()

# Using PunktSentenceTokenizer and training it
train_text = state_union.raw("2005-GWBush.txt")

custom_sentence_tokenizer_trained = PunktSentenceTokenizer(train_text)

sentences = custom_sentence_tokenizer_trained.tokenize(example_text)

for s in sentences:
    print(s)

print()

# Using PunktSentenceTokenizer with no training (it comes pretrained)
custom_sentence_tokenizer_untrained = PunktSentenceTokenizer()

sentences = custom_sentence_tokenizer_untrained.tokenize(example_text)

for s in sentences:
    print(s)
Example #30
import nltk
from nltk import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            named_entity = nltk.ne_chunk(tagged, binary=True)
            named_entity.draw()
    except Exception as e:
        print(str(e))
Example #31
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000

sentTokenier = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenier.tokenize(string))

vocab = Counter(words).most_common(vocab_size)
vocab_words = set(word for word, _ in vocab)  # top-vocab_size words for membership tests
sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]

new_sentences = []
with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        # keep only sentences whose words all fall within the vocabulary
        write = True
        for word in sentence:
            if word not in vocab_words:
                write = False
                break
        if write:
            file.writelines(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
Example #32
PATTERN = re.compile(
    r'''(?x)   # set flag to allow verbose regexps: ignores spaces and newlines
        (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
        | '(?:s|nt)\b           # 's, 'nt
        #     | \w+(?:-\w+)*    # words with optional internal hyphens
        | [a-zA-Z0-9]+          # words without internal hyphens e.g. Type1_gene(Attention: \w contains _)
        | \.\.\.                # ellipsis
        | [.,;"?:]              # these are separate tokens
        | [][()_`\|\n-]+        # these tokens are grouped; includes ] and [ and -
        | [^a-zA-Z0-9_\s]       # find these character in the AGAC training data.
        ''')

# default tokenizer
word_tokenizer = RegexpTokenizer(PATTERN)
sent_tokenizer = PunktSentenceTokenizer()


class Text(object):
    """
    Abstract text. e.g a paragrah, abstract and an essay.
    """
    def __init__(self,
                 text,
                 s_tokenizer=sent_tokenizer,
                 w_tokenizer=word_tokenizer):
        """
        :param text: str
        :param s_tokenizer: sentence_tokenzier
        :param w_tokenizer: word_tokenizer
        """
Example #33
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [

        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(
        length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""

        mecab_dictionary_path = McJapaneseTokenizer._mecab_ipadic_neologd_path(
        )

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException(
                "Unable to initialize MeCab: %s" % str(ex))

    @staticmethod
    def _mecab_ipadic_neologd_path(
    ) -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = McJapaneseTokenizer.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McJapaneseTokenizerException(
                "mecab-ipadic-neologd was not found in paths: %s" %
                str(candidate_paths))

        return mecab_dictionary_path

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split("\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.
        
        Definitions don't do much in the language module itself; the unit tests use them to verify that pos-id.def
        didn't change in some unexpected way and that we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, family name
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun, region (general)
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.
        
        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(
                    self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words