Example #1
    # Requires: nltk, nltk.tokenize.PunktSentenceTokenizer and termcolor.colored
    def __index_content(url_id, db, soup):
        # In Python 3, BeautifulSoup's .text already returns str, so the old
        # basestring/unicode encoding calls are unnecessary.
        title = soup.title.text if soup.title is not None else ""

        content = soup.find("div", {"id": "mw-content-text"}).text

        # content = soup.text

        custom_tokenizer = PunktSentenceTokenizer()
        tokenized_sentences = custom_tokenizer.tokenize(content)

        page = dict()
        page["title"] = title
        hints_list = list()
        page["hints"] = hints_list
        try:
            for sentence in tokenized_sentences:
                words = nltk.word_tokenize(sentence)
                tagged = nltk.pos_tag(words)

                grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
                {<NNP>+}"""
                chunk_parser = nltk.RegexpParser(grammar)
                chunked = chunk_parser.parse(tagged)
                for chunk in chunked.subtrees():
                    if chunk.label() == "NP":
                        line = list()
                        for each in chunk.leaves():
                            if len(each[0]) > 2:
                                line.append(each[0])
                        if len(line) > 0:
                            final_value = (" ".join(line)).lower()
                            hints_list.append(final_value)
                page["hints"] = hints_list

        except Exception as e:
            print(str(e))
        db.known_urls.update_one({"_id": url_id}, {"$set": {"content": page}})

        page_content_size = len(page["hints"])
        print(colored("\t\tUpdated With Indexed Content", "yellow"))

        # current_dir = os.getcwd()
        # files_dir = current_dir + "/Originals/"
        # file_name = url_id
        # file_path = files_dir + str(file_name)
        # created_file = open(file_path, "w")
        # created_file.write(content.encode("utf-8"))
        # created_file.close()
        # print("\t\tOriginal Content Is Saved")
        return page_content_size

from typing import List, Optional, Set

import nltk


def get_nltk_sents(txt: str,
                   tokenizer: nltk.PunktSentenceTokenizer,
                   extra_abbreviations: Optional[Set[str]] = None) -> List[str]:
    if extra_abbreviations is not None:
        tokenizer._params.abbrev_types.update(extra_abbreviations)

    return tokenizer.tokenize(txt)
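
A minimal usage sketch for the helper above, assuming NLTK's pre-trained English Punkt model (the "punkt" data package) is installed; the sample sentence and the extra abbreviation are illustrative:

import nltk

punkt_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")  # pre-trained English Punkt model
sents = get_nltk_sents(
    "Dr. Smith arrived at approx. 5 p.m. He left shortly afterwards.",
    tokenizer=punkt_tokenizer,
    extra_abbreviations={"approx"},  # prevents a sentence break after "approx."
)
print(sents)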
Example #3
class LanguageModel:
    """
    N-gram model
    """
    def __init__(self, n_gram=2, missed_value=0.99):
        """

        :param n_gram: length of n-gram
        :param missed_value: default value for all unseen n-gram
        """
        self.n = n_gram
        self.n_grams = {}
        self.context = {}
        self.sentence_tokenizer = SentenceTokenizer()
        self.tokenizer = Tokenizer()
        self.missed_value = missed_value

    def build_model(self, text):
        sentences = self.sentence_tokenizer.tokenize(text)
        words = [
            list(
                filter(
                    lambda s: s.isalpha(),
                    self.tokenizer.tokenize(sentence.strip())
                )
            ) for sentence in sentences
        ]
        for sentence in words:
            if len(sentence) < self.n:
                key = " ".join(sentence)
                self.context.update({key: self.context.get(key, 0) + 1})
            else:
                for i in range(len(sentence) - self.n + 1):
                    context_key = " ".join(sentence[i:i + self.n - 1])
                    n_gram_key = " ".join(sentence[i:i + self.n])
                    self.context.update({context_key: self.context.get(context_key, 0) + 1})
                    self.n_grams.update({n_gram_key: self.n_grams.get(n_gram_key, 0) + 1})

    def calculate_proba(self, sentence):
        words = list(
            filter(
                lambda s: s.isalpha(),
                self.tokenizer.tokenize(sentence.strip())
            )
        )
        result = 1
        for i in range(min(self.n - 2, len(words) - 1), len(words)):
            if i < self.n - 1:
                size = sum([val for key, val in self.context.items() if len(key.split(" ")) == i+1])
                result *= self.context.get(" ".join(words[:i+1]), self.missed_value if i == self.n - 2 else 0) / size
            elif i > self.n - 2:
                context_key = " ".join(words[i-self.n+1:i])
                n_gram_key = " ".join(words[i-self.n+1:i+1])
                context_val = self.context.get(context_key, self.missed_value)
                n_gram_val = self.n_grams.get(n_gram_key, self.missed_value)
                p = n_gram_val / context_val
                result *= p
        return result
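
A hypothetical usage sketch for the class above. SentenceTokenizer and Tokenizer are project-local classes not shown here; they are assumed to behave like NLTK's sentence and word tokenizers:

lm = LanguageModel(n_gram=2)
lm.build_model("The cat sat on the mat. The dog sat on the rug.")
# Probability of a new sentence as a product of bigram probabilities,
# falling back to missed_value for unseen n-grams.
print(lm.calculate_proba("The cat sat on the rug"))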
Example #4
from nltk.tokenize import BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer


def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(
                        current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
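
A short usage sketch (the dialogue text is illustrative); each returned item is one subtitle, i.e. a list of at most two lines of at most 38 characters:

dialogue = (
    "Hello there, how are you doing today? I was hoping we could talk about the schedule.\n"
    "\n"
    "Of course. What would you like to discuss first?"
)
for subtitle in tokenize_english_document(dialogue):
    print(subtitle)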

import nltk
import numpy as np
from collections import Counter
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.base import BaseEstimator, TransformerMixin


class SentenceToVec(BaseEstimator, TransformerMixin):

    def __init__(self, stop_words, vector_len=1000):
        self.vocab = []
        self.stop_words = stop_words
        self.vector_len = vector_len

        self.tokenizer = PunktSentenceTokenizer()

    def format_word(self, word):
        if word.isdigit():
            return "0"
        elif word in self.stop_words:
            return ""
        else:
            return word.strip()

    def tokenize(self, sentence):
        res_tokens = []
        tokens_temp = self.tokenizer.tokenize(sentence)
        for tokens in tokens_temp:
            tokens = nltk.word_tokenize(tokens)
            tokens = [self.format_word(t) for t in tokens]
            res_tokens += [t for t in tokens if t]
        return res_tokens

    def fit(self, X, y=None):
        self.vocab = []
        word_freq = Counter()
        for i in range(X.shape[0]):
            for w in self.tokenize(X[i]):
                if w not in self.stop_words:
                    word_freq[w] += 1

        for term, freq in word_freq.most_common():
            if len(self.vocab) < self.vector_len:
                self.vocab.append(term)
        return self

    def _vectorize(self, words):
        freq = dict(Counter(words))
        vector = []
        for v in self.vocab:
            vector.append(freq.get(v, 0))
        return np.array(vector)

    def transform(self, X, copy=True):
        _X = np.zeros((X.shape[0], len(self.vocab)))
        for i in range(X.shape[0]):
            _X[i] = self._vectorize(self.tokenize(X[i]))
        return _X
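
A minimal usage sketch, assuming NLTK's stopwords corpus is available; the two documents are illustrative:

import numpy as np
from nltk.corpus import stopwords

docs = np.array([
    "The weather is nice today. I walked to work.",
    "Python makes text processing easy, and the weather was bad yesterday.",
])
vectorizer = SentenceToVec(stop_words=set(stopwords.words("english")), vector_len=50)
features = vectorizer.fit(docs).transform(docs)
print(features.shape)  # (2, size of the learned vocabulary, capped at vector_len)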

import re

from nltk.tokenize import PunktSentenceTokenizer


def pre_segment(doc):
    """Set sentence boundaries with nltk instead of spacy."""
    if len(str(doc.text).split()) > 3:
        tokenizer = PunktSentenceTokenizer(doc.text)
        sentences = tokenizer.tokenize(doc.text)
        for nltk_sentence in sentences:
            words = re.findall(r"[\w]+|[^\s\w]", nltk_sentence)
            for i in range(len(doc) - len(words) + 1):
                token_list = [str(token) for token in doc[i:i + len(words)]]
                if token_list == words:
                    doc[i].is_sent_start = True
                    for token in doc[i + 1:i + len(words)]:
                        token.is_sent_start = False
    return doc
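
A usage sketch assuming a spaCy 2.x pipeline, where a plain function can be added as a component; spaCy 3 would instead require registering pre_segment with @Language.component and adding it by name:

import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(pre_segment, before="parser")  # set sentence starts before the parser runs
doc = nlp("Dr. Smith went to Washington. He arrived on Monday.")
print([sent.text for sent in doc.sents])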
Example #7
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def sentence_tokenizer(text):
    """
    Tokenizes sentences.

    :param text:
    :return: list of sentences (a sentence is a string)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv',
        'bzw', 'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort', 'co',
        'kg', 'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b', 'evtl',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
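
A brief usage sketch with an illustrative German sentence; the abbreviations "Vj.", "ca.", "bzw.", "rd.", "Mio." and "Dr." are covered by the list above, so the text should split into exactly two sentences:

text = ("Der Umsatz stieg im Vj. um ca. 5 Prozent bzw. rd. 2 Mio. Euro. "
        "Dr. Meier zeigte sich zufrieden.")
for sent in sentence_tokenizer(text):
    print(sent)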
Example #8
 def handle(self, *app_labels, **options):
     print(app_labels)
     print(options)
     for article in BwogArticle.objects.all():
         sentence_tokenizer = PunktSentenceTokenizer()
         sentences = sentence_tokenizer.tokenize(article.body)
         for sentence_index in range(len(sentences)):
             sentence = sentences[sentence_index]
             sentence_words = nltk.word_tokenize(sentence)
             tagged = nltk.pos_tag(sentence_words)
             for tup_index in range(len(tagged)):
                 tup = tagged[tup_index]
                 article_word = tup[0]
                 article_tag = tup[1]
                 p = ParsedItem(content_object=article, word=article_word, tag=article_tag,
                                sentence_sequence=sentence_index, word_sequence=tup_index)
                 p.save()
                 print(p)

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer


def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                     }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # #print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
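
A usage sketch for the function above; it needs the NLTK resources used in the body (the state_union corpus, punkt, the POS tagger and the NE chunker models). The sample post is illustrative:

trees = name_ent_recog("Barack Obama visited Microsoft headquarters in Seattle last summer.")
for tree in trees:
    for subtree in tree.subtrees():
        if subtree.label() != "S":
            print(subtree.label(), " ".join(word for word, tag in subtree.leaves()))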
Example #10
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when
"""

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text=train_text)
tokenized_sentences = custom_sentence_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for sentence in tokenized_sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunk_gram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)

            print(chunked)

    except Exception as e:
        print(str(e))
Example #11
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 12:49:39 2020

@author: alex.a.murray
"""
import nltk 
from nltk.corpus import state_union
from nltk import PunktSentenceTokenizer


train_text = state_union.raw("2005-GWBush.text")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))


process_content()
Example #12
 def tokenize_to_sentences(self, paragraph):
     tokenizer = PunktSentenceTokenizer()
     sentences = tokenizer.tokenize(paragraph)
     return sentences
Example #13
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when
"""

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text=train_text)
tokenized = custom_sentence_tokenizer.tokenize(text=sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunk_gram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT>+{"""

            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)

            chunked.draw()

    except Exception as e:
        print(str(e))
Example #14
File: tpos.py  Project: melsk125/ner
    raw = f.read()

lines = lib.get_dat_sgml(raw)

sys.stderr.write(str(len(lines)) + " entries\n")

p = PunktSentenceTokenizer()

for i in range(len(lines)):
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    keywords = re.split("\t", line["EKYWD"])
    keywords = [word_tokenize(keyword) for keyword in keywords]
    for sentence in abstract:
        pos_sentence = pos_tag(sentence)
        # simplify_wsj_tag comes from NLTK 2's nltk.tag.simplify (removed in NLTK 3)
        pos_sentence = [(word, simplify_wsj_tag(t)) for word, t in pos_sentence]
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                if keyword_len > 0 and keyword == sentence[j:j+keyword_len]:
                    for l in range(keyword_len):
                        this_word = keyword[l]
Example #15
File: gpos.py  Project: wenh81/ner
for keyword in all_keywords:
    keywords.append(word_tokenize(keyword))

sys.stderr.write("All keywords: " + str(len(all_keywords)) + "\n")

p = PunktSentenceTokenizer()

for i in range(len(lines)):
    if i % 10 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    for sentence in abstract:
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                if keyword_len > 0 and keyword == sentence[j:j+keyword_len]:
                    for l in range(keyword_len):
                        this_word = keyword[l]
                        out = this_word + "\t"
                        if l == 0:
                            out += "B"
                        else:
Example #16
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
import nltk

example = "Hello Mr. Holmes. How are you doing? The weather is nice Holmes and Python is amazing. I hope you like it too!"
sen_list = sent_tokenize(example)
sen = sen_list[2]
print(sen)
stop_words = set(stopwords.words('english'))
'''words = word_tokenize(sen)
filtered_words = []
for w in words:
    if w not in stop_words:  # stop-word filtering
        filtered_words.append(w)
print(filtered_words)
'''
tokenize = PunktSentenceTokenizer(sen)
tokenized = tokenize.tokenize(sen)  # sentence tokenizing
print(tokenized)
for i in tokenized:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)
    # Chunking
    '''
    The chunk grammar below is a regex over POS tags: . matches any character in
    a tag name and ? makes the preceding element optional (zero or one occurrence).
    See the chunking tutorial on pythonprogramming.net for more detail.
    '''
    chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?} """  # RB, VB, NNP etc. are POS tags (VB = verb); the pattern groups matching word sequences into a chunk
    chunkparser = nltk.RegexpParser(chunkgram)
    chunked = chunkparser.parse(tagged)
    print(chunked)
Example #17
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000

sentTokenizer = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenizer.tokenize(string))

vocab = Counter(words).most_common(vocab_size)
vocab_words = {word for word, _ in vocab}  # set of the words themselves; Counter(vocab) would key on (word, count) tuples
sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]

new_sentences = []
with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        write = True
        for word in sentence:
            if word in vocab_words:  # skip sentences containing any top-vocab word
                write = False
                break
        if write:
            file.writelines(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
#Representing the words with their Parts of Speech
import nltk
from nltk.corpus import state_union
''' PunktSentenceTokenizer is an unsupervised, machine-learned sentence tokenizer.
NLTK ships a pre-trained model, and it can also be trained further on new text. '''
from nltk import PunktSentenceTokenizer

train = state_union.raw("2005-GWBush.txt")
text = state_union.raw("2006-GWBush.txt")
SentenceTokenizer = PunktSentenceTokenizer(train)

tokenized = SentenceTokenizer.tokenize(text)


def process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process()
Example #19
sentences = sent_tokenize(example_text)

for w in words:
    print(w)

print()

for s in sentences:
    print(s)

print()

# Using PunktSentenceTokenizer and training it
train_text = state_union.raw("2005-GWBush.txt")

custom_sentence_tokenizer_trained = PunktSentenceTokenizer(train_text)

sentences = custom_sentence_tokenizer_trained.tokenize(example_text)

for s in sentences:
    print(s)

print()

# Using PunktSentenceTokenizer without supplying training text (it falls back to its default parameters)
custom_sentence_tokenizer_untrained = PunktSentenceTokenizer()

sentences = custom_sentence_tokenizer_untrained.tokenize(example_text)

for s in sentences:
    print(s)