Example #1
    def stem(self, min_word_count=10):
        # Build the stemmer from vocabulary words that occur at least
        # min_word_count times.
        stemmer = Stemmer({w: n for (w, n) in self.vocab.items()
                           if n >= min_word_count})

        # Replace every token of every sentence with its stem.
        for mail in self.mails:
            mail.sents = [[stemmer.stem(w) for w in sent] for sent in mail.sents]

        self.stemmer = stemmer
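
For context, self.vocab above is evidently a word-to-count mapping. A minimal sketch of how such a vocabulary could be built with collections.Counter; the corpus layout and threshold are illustrative assumptions, not this project's code:

from collections import Counter

# Hypothetical corpus: a list of sentences, each a list of tokens.
sents = [["the", "quick", "fox"], ["the", "lazy", "dog"]]
vocab = Counter(word for sent in sents for word in sent)

# Only words seen at least min_word_count times reach the Stemmer.
min_word_count = 2
print({w: n for w, n in vocab.items() if n >= min_word_count})  # {'the': 2}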
Example #2
File: main.py Project: Attil/WEDT
def main(args):
    dl = DataLoader()
    stem = Stemmer('porter')

    # files is a list of dicts, one per input file, each mapping a key to
    # the stemmed text of its value; entries whose stemmed value is empty
    # are dropped.
    files = [{element[0]: stem.stem(element[1])
              for element in dl.load_data(file)
              if stem.stem(element[1])}
             for file in args]

    for file, arg in zip(files, args):
        print('Processing file {}...'.format(arg))
        # Materialize each stemmed value into a list of tokens
        file = {k: list(v) for k, v in file.items()}

        print('Data Clusterer')
        test_clusterer(DataClusterer(list(file.values()), 'euclidean'), file)

        print('-'*64)

        print('Description Clusterer')
        test_clusterer(DescriptionClusterer(list(file.values()), 'cosine'), file)
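
The two clusterers above differ mainly in their distance metric ('euclidean' vs. 'cosine'). A minimal sketch of both metrics on toy vectors, independent of the project's DataClusterer and DescriptionClusterer:

import math

def euclidean(a, b):
    # Straight-line distance between two vectors
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

def cosine_distance(a, b):
    # 1 - cosine similarity; depends on angle, not magnitude
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return 1 - dot / (norm_a * norm_b)

print(euclidean([1, 0], [0, 1]))        # ~ 1.414
print(cosine_distance([1, 0], [0, 1]))  # 1.0 (orthogonal)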
Example #3
    def test_stem(self):
        """Checks the final stems."""
        stemmer = Stemmer()
        # Each line of voc.txt is a word; the corresponding line of
        # output.txt is its expected stem.
        with open('voc.txt') as voc, open('output.txt') as output:
            for word in voc:
                word = word.strip()
                expected = next(output).strip()
                result = stemmer.stem(word)
                self.assertEqual(result, expected,
                                 "Test failed for word '%s': stemmed "
                                 "to %s, should have been %s"
                                 % (word, result, expected))
Example #4
    def predict(self, doc):
        # Prepare document
        doc = self.clean(doc)

        # Score each class and return the one with the highest score
        score = []

        for cat in self.C:
            # Log prior: fraction of training documents in this class
            probability = math.log10(self.DC[cat] / self.D)

            for word in doc.split():
                if len(word) > 2:
                    cur_word = Stemmer.stem(u'{}'.format(word))
                    # Log likelihood with add-one (Laplace) smoothing
                    probability += math.log10(
                        (self.WiC[cat].get(cur_word, 0) + 1) /
                        (len(self.W) + self.WC[cat]))

            score.append(probability)

        return self.C[score.index(max(score))]
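
The quantity accumulated above is the standard multinomial Naive Bayes log posterior, log10 P(c) plus the sum over words of log10 P(w|c), with add-one smoothing in the word term. A toy standalone computation with invented counts:

import math

# 3 of 4 training documents are "spam"; the word "pill" was counted
# 5 times in spam; vocabulary size 10; 20 counted words in spam.
log_prior = math.log10(3 / 4)
log_likelihood = math.log10((5 + 1) / (10 + 20))
print(log_prior + log_likelihood)  # ~ -0.824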
Example #5
    def train(self, doc, category):
        # Prepare document
        doc = self.clean(doc)

        # Update classifier:
        # Update D
        self.D += 1

        # Update C & DC
        if category not in self.C:
            self.C.append(category)
            self.DC[category] = 1
        else:
            self.DC[category] += 1

        for word in doc.split():
            if len(word) > 2:
                # 'Normalize' word
                cur_word = Stemmer.stem(u'{}'.format(word))

                # Update W: global vocabulary of seen words
                if cur_word not in self.W:
                    self.W.append(cur_word)

                # Update WC: total counted words in this category
                if category not in self.WC:
                    self.WC[category] = 1
                else:
                    self.WC[category] += 1

                # Update WiC: per-category count for this word
                if category not in self.WiC:
                    self.WiC[category] = {}
                if cur_word not in self.WiC[category]:
                    self.WiC[category][cur_word] = 1
                else:
                    self.WiC[category][cur_word] += 1
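
A minimal usage sketch pairing train with the predict method from the previous example. The class name NaiveBayesClassifier and an empty initial state (D = 0, C = [], DC = {}, W = [], WC = {}, WiC = {}) are assumptions read off the counters above, and clean is assumed to lower-case and strip punctuation:

# Hypothetical driver; only the method names are taken from the examples.
clf = NaiveBayesClassifier()
clf.train("cheap pills buy cheap pills now", "spam")
clf.train("meeting agenda attached for monday", "ham")
print(clf.predict("buy cheap pills"))  # expected: spam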
Example #6
from stopwordsremover import StopWordsRemover
from texthandler import TextHandler
from stemmer import Stemmer
from tfidf import TFIDFHandler
from searchhandler import SearchHandler

# Text to be converted
text = """
The 2019–20 coronavirus pandemic is an ongoing pandemic of coronavirus disease 2019 (COVID-19), caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The outbreak was first noted in Wuhan, Hubei province, China, in December 2019. The World Health Organization (WHO) declared the outbreak to be a Public Health Emergency of International Concern on 30 January 2020 and recognized it as a pandemic on 11 March 2020. As of 6 April 2020, more than 1,270,000 cases of COVID-19 have been reported in over 200 countries and territories, resulting in approximately 69,400 deaths. More than 260,000 people have recovered.
"""
# Remove stopwords, i.e. low-content words such as "is", "a", and "and"
removed_stopwords_text = StopWordsRemover.remove(text)
# Stem to reduce inflected words to their stem, base, or root form
stemmed_text = Stemmer.stem(removed_stopwords_text)
# Count how many times each word appears in the document
sanitized_text = TextHandler.WordCounter(stemmed_text)

book1 = {
    "ID": '1',
    "Title": "Covid",
    "Subtitle": "viruses",
    "Author": "author 1",
    "RawText": text,
    "SanitizedText": sanitized_text,
    "RemovedStopWordsText": removed_stopwords_text,
    "TotalNoOfTerms": len(text.lower().split(" ")),
    "TFIDF": 0,
}
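
The book record leaves "TFIDF" at 0, presumably for TFIDFHandler to fill in later. As a reminder of what that number usually means, a minimal sketch of the standard tf-idf computation in plain Python (not this project's handler):

import math

def tf_idf(term_count, doc_len, n_docs, docs_with_term):
    tf = term_count / doc_len                  # frequency within the document
    idf = math.log10(n_docs / docs_with_term)  # rarity across the corpus
    return tf * idf

# "covid" appearing 3 times in a 100-word document, in 1 of 2 documents:
print(tf_idf(3, 100, 2, 1))  # ~ 0.009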

text2 = """
Artificial neural networks (ANN) or connectionist systems are computing systems vaguely inspired by the biological neural networks that constitute animal brains. Such systems "learn" to perform tasks by considering examples, generally without being programmed with task-specific rules. For example, in image recognition, they might learn to identify images that contain cats by analyzing example images that have been manually labeled as "cat" or "no cat" and using the results to identify cats in other images. They do this without any prior knowledge of cats, for example, that they have fur, tails, whiskers and cat-like faces. Instead, they automatically generate identifying characteristics from the examples that they process.
"""
Example #7
    def query(self, value):
        # Automatically stem the user's input
        self._query = Stemmer.stem(value)
Example #8
    def _get_features(self, tokens, idx):
        stemmer = Stemmer()
        numbs = numbers.values()
        puncts = punctuations.values()

        def token_features(token, label, word_first=False):
            # Shared feature template: digit/punctuation flags, the
            # stemmed word itself, and character prefixes/suffixes of
            # length 1-3.
            feats = []
            for number in numbs:
                if number in list(token):
                    feats.append("HAS_NUM")
            for punctuation in puncts:
                if punctuation in list(token):
                    feats.append("PUNCTUATION")
            if word_first:
                feats.append(label + token)
            for n in (1, 2, 3):
                if len(token) > n:
                    feats.append("SUF_" + token[-n:])
                    feats.append("PRE_" + token[:n])
            if not word_first:
                feats.append(label + token)
            return feats

        token = stemmer.stem(tokens[idx])
        feature_list = []

        if not token:
            return feature_list

        feature_list += token_features(token, "WORD_", word_first=True)

        # Context window of two tokens on each side; an empty stem at
        # any offset stops feature extraction early and returns whatever
        # has been collected so far.
        for offset, label in ((-1, "PREV_WORD_"), (-2, "PREV_PREV_WORD_"),
                              (1, "NEXT_WORD_"), (2, "NEXT_NEXT_WORD_")):
            if 0 <= idx + offset < len(tokens):
                context = stemmer.stem(tokens[idx + offset])
                if not context:
                    return feature_list
                feature_list += token_features(context, label)

        return feature_list
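
To make the feature template concrete, a standalone illustration of the prefix/suffix features on a plain (unstemmed) token; demo_features is a hypothetical helper, not part of the project:

def demo_features(token):
    # Word identity plus character prefixes/suffixes up to length 3
    feats = ["WORD_" + token]
    for n in (1, 2, 3):
        if len(token) > n:
            feats.append("SUF_" + token[-n:])
            feats.append("PRE_" + token[:n])
    return feats

print(demo_features("books"))
# ['WORD_books', 'SUF_s', 'PRE_b', 'SUF_ks', 'PRE_bo', 'SUF_oks', 'PRE_boo']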
Example #9
    def pos_tag(self, sentence):
        # Stem the sentence, tokenize it, then run the tagger
        stemmer = Stemmer()
        sent = stemmer.stem(sentence)
        sent = WordTokenizer(sent)
        tags = self.tag(sent)
        return tags
Example #10

import re
import numpy as np
import pandas as pd
from stemmer import Stemmer

data = pd.read_excel("Gujarati_Dimensionality_Reduction.xlsx")

input_text = input()

s1 = "રોગ"  # Gujarati for "disease"
if input_text.startswith(s1):
    flag = 1
    stemmer = Stemmer()
    stemmed_text = stemmer.stem(input_text)
    # Split the stemmed text on semicolons, commas, or whitespace
    stemmed_words = re.split(r'[;,\s]\s*', stemmed_text)

    diseases_dire = []

    for i in range(len(stemmed_words)):
        for col in data['રોગ']:
            if (stemmed_words[i] == col and stemmed_words[i] != 'રોગ'):
                diseases_dire.append(stemmed_words[i])

    print("રોગો મેળ ખાતા ફોર્મ ડેટાસેટ : ", diseases_dire)
    diseases_dire = np.array(diseases_dire)

    data2 = pd.read_excel("Gujarati_Dataset2.xlsx")

    Y = data2[data2.columns[0]].to_numpy()
Example #11

    # Excerpt from a mail-export routine; it needs `from datetime import date`,
    # and `m`, `outf`, `fields`, and `stemmer` come from the surrounding code.
    d = m.__dict__
    d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y")

    with open(outf, "wt") as f:
        d["from"] = d.pop("sender")
        if m.sfbi:
            ce = d["contact-email"]
            ce = "\t".join(ce) if type(ce) is set else ce
            d["contact-email"] = ce.replace(" [dot] ", ".").replace("[at]", "@")

            cn = d["contact-nom"]
            d["contact-nom"] = "\t".join(cn) if type(cn) is set else cn

            if "lieu" in d:
                # TODO: A mettre dans le parser web:
                d["lieu"] = d["lieu"].replace("\n\n", "\n")

            f.write("!!! Ceci est le contenu d'une annonce sur le site sfbi.fr.\n")
        for field in fields:
            if field in d:
                f.write("* {}: {}\n".format(field, d[field]))

        f.write("-" * 60 + "\n")

        f.write(m.description)

        f.write("-" * 60 + "\n")

        for sent in m.sents:
            f.write(" ".join(stemmer.stem(word) for word in sent) + "\n")