Example #1
 def __init__(self, docs_dir, docs_size):
     self.docLoader = DocLoader(docs_dir, docs_size)
     self.tokenizer = Tokenizer()
     self.stemmer = Stemmer()
     self.dictionary = Dictionary(load=False)
     self._clean()
     self._setup(docs_size)
Example #2
    def __init__(self, stemming):
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(
            ['rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m', '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re',
             r' ', r'', r"", r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
             r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{', '}', "'&'", '.', r'\'d',
             '-', '--'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        self.text_tokens = None

        self.stemmer = None
        if stemming:
            self.stemmer = Stemmer()

        self.hashtag_split_pattern = re.compile(r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')
        self.take_off_non_latin = re.compile(
            pattern=r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]')
        self.left_slash_pattern = re.compile(r'^-?[0-9]+/0*[1-9][0-9]*$')
        self.right_slash_pattern = re.compile(r'^-?[0-9]+\\0*[1-9][0-9]*$')

        self.days_dict = {"Sat": "saturday", "Sun": "sunday", "Mon": "monday", "Tue": "tuesday", "Wed": "wednsday",
                          "Thu": "thursday", "Fri": "friday"}
        self.months_dict = {"Jul": ("july", "07"), "Aug": ("august", "08")}

        self.kbm_shorts = {"k": None, "m": None, "b": None, "K": None, "M": None, "B": None}
Example #3
 def __init__(self, stem):
     self.stop_words = stopwords.words('english')
     self.stop_words.extend([
         'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
         'there', 'about', 'once', 'during', 'out', 'very', 'having',
         'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its',
         'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
         'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the',
         'themselves', 'until', 'below', 'are', 'we', 'these', 'your',
         'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more',
         'himself', 'this', 'down', 'should', 'our', 'their', 'while',
         'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when',
         'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in',
         'will', 'on', 'does', 'yourselves', 'then', 'that', 'because',
         'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he',
         'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
         'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if',
         'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how',
         'further', 'was', 'here', 'than', 'rt', "don't", '-', '&',
         'it’s', 'don’t', 'i’m', "it's", "doesn't", 'https', 't.co',
         'twitter.com', 'weve', 'ur', 'due', 'damn', 'us', 'theyre',
         'would', 'might'
     ])
     self.stop_words_dict = {w: 0 for w in self.stop_words}
     # self.extra_stop_words = {"rt": 0, "https": 0, "t.co": 0, "twitter.com": 0, "weve": 0, "ur": 0, "due": 0, "damn": 0, "us": 0, "theyre": 0, "would": 0, "might": 0}
     # self.stop_words_dict.update(self.extra_stop_words)
     self.term_dict = {}
     self.toStem = stem
     self.text_tokens = []
     if self.toStem:
         self.stemmer = Stemmer()
Example #4
 def __init__(self, config=None):
     self.tmp_for_entites = {}
     self.stop_words = stopwords.words('english') + [
         '?', '!', ',', '+', '-', '*', '/', '"', '.', '<', '>', '=', ':',
         '', '{', '{}', '}', '[', ']', '[]', 'are', 'and', 'an', 'at', 'am',
         'a', 'even', 'every', 'everyone', 'rt', 'RT'
     ]
     self.global_dict = {}  #value=number of docs
     self.post_dict = {}  # key="word", value=[parquet name, index in parquet, tweet id, frequency in tweet, location in tweet, tf]
     self.entities = {}
     self.path_stop_words = [
         'RT', "rt", 'tweet', 'www', 'http', 'https', 'WWW'
     ]
     self.corona_list = [
         "cov", 'corona', 'coronavirus', 'covid', 'covid19', 'covid 19',
         'corona virus', 'virus corona', 'corona_virus', 'virus_corona',
         "virus"
     ]
     self.config = config
     self.trump = [
         "donald", "donald trump", "trump donald", "president",
         "trump_donald", "donald_trump", "trump-donald", "donald-trump"
     ]
     self.stemmer = None
     if self.config.toStem:
         self.stemmer = Stemmer()
Example #5
    def stem(self, min_word_count=10):
        stemmer = Stemmer({w:n for (w,n) in self.vocab.items()
                               if n >= min_word_count})

        for mail in self.mails:
            mail.sents = [[stemmer.stem(w) for w in sent] for sent in mail.sents]

        self.stemmer = stemmer
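The dict comprehension above restricts the stemmer's vocabulary to words seen at least min_word_count times. A tiny illustration of that filter with toy counts (not project data):

vocab = {"protein": 42, "sequencing": 3, "bioinformatics": 17}
kept = {w: n for (w, n) in vocab.items() if n >= 10}
print(kept)  # {'protein': 42, 'bioinformatics': 17} -- rare words are dropped before stemming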
Example #6
 def test_VC_measure(self):
     """Tests the VC measure."""
     stemmer = Stemmer()
     for word, measure in VC_DATA.items():
         self.failUnless(stemmer.m(word) == measure,
                         "Measure test failed for word '%s' calculated (%d) \
                         should have been (%d)" % (word, stemmer.m(word),
                         measure))
Example #7
    def add_new_doc(self, document, documents_list_length=10000):
        """
        This function performs the indexing process for a document object.
        Saved information is captured via two dictionaries ('inverted index' and 'posting').
        :param document: a document that needs to be indexed.
        :return: -
        """

        try:
            document_dictionary = document.term_doc_dictionary
            # self.countDoc += 1
            for term in document_dictionary.keys():
                if self.stemming == 'y':
                    my_stemmer = Stemmer()
                    term = my_stemmer.stem_term(term)
                    # Update inverted index and posting
                if term not in self.inverted_idx.keys():
                    self.inverted_idx[term] = [
                        1, [(document_dictionary[term], document.tweet_id)]
                    ]  # amount of doc, freq in the doc, doc id.

                else:
                    self.inverted_idx[term][0] += 1  # amount of doc
                    self.inverted_idx[term][1].append(
                        (document_dictionary[term],
                         document.tweet_id))  # freq in the doc # doc id

                if term not in self.postingDict.keys():
                    self.postingDict[term] = [(document.tweet_id,
                                               document_dictionary[term])]
                else:
                    self.postingDict[term].append(
                        (document.tweet_id, document_dictionary[term]))
                # self.countTweet -= 1

                if document.tweet_id not in self.tweet_dict.keys():
                    self.tweet_dict[document.tweet_id] = [
                        [term, document_dictionary[term]], 1, 0
                    ]  # [term,freq in tweet], amount of unique terms in tweet, amount of terms in tweet
                elif document_dictionary[term] > self.tweet_dict[
                        document.tweet_id][0][
                            1]:  # tweet exists, comparing freq between the two terms
                    if self.tweet_dict[document.tweet_id][0][
                            1] == 1:  # before changing the term, check if the last term is unique
                        self.tweet_dict[document.tweet_id][
                            1] += 1  # last term is unique: add to the amount of unique terms in tweet
                    self.tweet_dict[document.tweet_id][0] = [
                        term, document_dictionary[term]
                    ]  # change between the terms
                    self.tweet_dict[document.tweet_id][2] += 1
                elif document_dictionary[
                        term] == 1:  # tweet exists, not most common, check if unique
                    self.tweet_dict[document.tweet_id][1] += 1
                    self.tweet_dict[document.tweet_id][2] += 1
        except:
            # print('problem in indexer : add_new_doc')
            # print(traceback.print_exc())
            pass
Example #8
 def __init__(self, config):
     self.word_dict = {}
     self.stemmer = Stemmer(config.stemming)
     self.stop_words = [
         self.stemmer.stem_term(word) for word in stopwords.words('english')
     ] + ['rt', 't.co', 'https']
     self.rules = config.parser_rules
     self.spell = SpellChecker()
     self.min_length = config.min_length
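Side note on the pattern above: stemming the stop-word list at construction time keeps it comparable with tokens that are stemmed later. A minimal sketch of the same idea using NLTK's PorterStemmer (assumed here; the example's own Stemmer wrapper is not shown):

from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed_stop = {ps.stem(w) for w in ("this", "having", "doing")}
print(ps.stem("having") in stemmed_stop)  # True: token and stop word agree after stemming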
Example #9
 def test_VC_measure(self):
     """Tests the VC measure."""
     stemmer = Stemmer()
     for word, measure in VC_DATA.items():
         self.failUnless(
             stemmer.m(word) == measure,
             "Measure test failed for word '%s' calculated (%d) \
                         should have been (%d)" %
             (word, stemmer.m(word), measure))
Example #10
 def __init__(self, with_stemmer=False, include_urls=False, include_quote=False, debug=False, timer=False):
     self.stemmer = Stemmer()
     self.with_stemmer = with_stemmer
     self.include_urls = include_urls
     self.include_quote = include_quote
     self.stop_words = stopwords.words('english')
     self.stop_words += ["i'm", "it's", 'they', "i've", 'you', 'u', 'we', 'rt', 'im', 'use', 'sure', ]
     self.debug = debug
     self.timer = timer
     self.times = []
Example #11
 def test_stem(self):
     """Checks the final stems."""
     stemmer = Stemmer()
     output = file('output.txt')
     for word in file('voc.txt'):
         word = word.strip()
         stem = output.next().strip()
         self.failUnless(stemmer.stem(word) == stem,
                         "Test failed for word \'%s\' stemmed "\
                         "to %s should have been %s"\
                         % (word, stemmer.stemmed, stem))
Example #12
    def __init__(self, config):
        self.with_stem = config.get_toStem()
        self.stemmer = Stemmer()
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            r' ', r'', r"", r"''", r'""', r'"', r"“", r"”", r"’", r"‘", r"``",
            r"'", r"`", '"'
        ])
        self.stop_words.extend([
            'rt', r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']',
            r'{', '}',
            "'&'", '$', '.', r'\'s', '\'s', '\'d', r'\'d', r'n\'t'
        ])
        self.stop_words.extend(['1️⃣.1️⃣2️⃣'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        # for avg
        self.total_len_docs = 0
        self.number_of_documents = 0

        self.url_pattern = re.compile(r'http\S+')
        self.url_www_pattern = re.compile(r"[/://?=]")
        # TODO - fix numbers pattern
        self.numbers_pattern = re.compile(r'^\d+([/|.|,]?\d+)*')
        self.non_latin_pattern = re.compile(
            pattern=
            r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]'
        )
        self.dates_pattern = re.compile(
            r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
        )
        # TODO - fix emoji to include all emojis
        self.emojis_pattern = re.compile(
            pattern="["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00010000-\U0010ffff"
            u"\U0001f926-\U0001f937"
            u"\U000024C2-\U0001F251"
            u"\U00002702-\U000027B0"
            u"\u2640-\u2642"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"
            u"\u3030"
            u"\u2600-\u2B55"
            u"\uFE0F\u20E3\uFE0F\u20E3\uFE0F\u20E3"
            "]+",
            flags=re.UNICODE)
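For reference, the non_latin_pattern above matches any character outside the basic and extended Latin ranges (plus U+2019), so substituting it with an empty string strips emoji and CJK text while keeping accented Latin letters. A small standalone check (the sample string is illustrative):

import re

non_latin = re.compile(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]')
print(non_latin.sub('', "héllo 😀 世界"))  # 'héllo  ' -- emoji and CJK removed, accents kept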
Example #13
 def test_stem(self):
     """Checks the final stems."""
     stemmer = Stemmer()
     output = file('output.txt')
     for word in file('voc.txt'):
         word = word.strip()
         stem = output.next().strip()
         self.failUnless(stemmer.stem(word) == stem,
                         "Test failed for word \'%s\' stemmed "\
                         "to %s should have been %s"\
                         % (word, stemmer.stemmed, stem))
Example #14
 def __init__(self, stemming):
     self.stop_words = stopwords.words('english')
     self.stop_words += ["rt", "http", "https", "www",
                         "twitter.com"]  # TODO: check &amp
     self.terms = set()
     self.nonstopwords = 0
     self.max_tf = 0
     self.toStem = stemming
     self.entities = {}
     if self.toStem:
         self.stemmer = Stemmer()
Example #15
 def __init__(self, config=None, advanced=False):
     # stopwords_to_add = ['rt']
     self.english_word = words.words()
     self.stop_words = stopwords.words('english')
     puncs_to_add = ['...', '', '\'', '“', '”', '’', '…']
     self.punctuators = [punc for punc in string.punctuation] + puncs_to_add
     self.tt = TweetTokenizer()
     self.stemmer = Stemmer()
     self.need_stemming = config.toStem if isinstance(
         config, ConfigClass) else False
     self.caps_dict = {}
     self.rules_dict = {}
     self.advanced = advanced
Example #16
    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """

        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        indice = doc_as_list[4]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        retweet_indice = doc_as_list[7]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        quoted_indice = doc_as_list[10]
        retweet_quoted_text = doc_as_list[11]
        retweet_quoted_url = doc_as_list[12]
        retweet_quoted_indice = doc_as_list[13]

        term_dict = {}

        tokenized_text = self.parse_sentence(full_text)
        tokenized_quote = self.parse_sentence(quote_text)
        tokenized_url = self.handle_url(url)

        doc_length = len(
            tokenized_text)  # after text operations - length of full_text

        new_tokenized_text = tokenized_text + tokenized_url + tokenized_quote

        if self.stemming:
            s = Stemmer()
            # stem each token; build a new list instead of mutating the one being iterated
            new_tokenized_text = [s.stem_term(token) for token in new_tokenized_text]

        for term in new_tokenized_text:
            if term != "":  # or (term.isalpha() and len(term) == 1)
                if term not in term_dict:
                    term_dict[term] = 1
                else:
                    term_dict[term] += 1

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length)

        return document
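The frequency-counting loop above is equivalent to collections.Counter over the non-empty tokens; a minimal sketch with illustrative tokens (not project data):

from collections import Counter

tokens = ["mask", "virus", "mask", ""]
term_dict = dict(Counter(t for t in tokens if t != ""))
print(term_dict)  # {'mask': 2, 'virus': 1}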
Example #17
    def test_stemmer(self):
        line = "мамочка свари суп"
        #tok = Tokenizer().tokenize_alph(line)

        fact = list(Stemmer().stem(Token(0, 7, line, 'a'), 4, line))

        check = [Token(0, 7, line, 'a'), Token(0, 6, line, 'a'),
                 Token(0, 5, line, 'a'), Token(0, 4, line, 'a'), Token(0, 3, line, "a")]

        fact1 = list(Stemmer().stem(Token(14, 17, line, "a"), 4, line))
        check1 = [Token(14, 17, line, "a")]

        self.assertEqual(fact, check)
        self.assertEqual(fact1, check1)
Example #18
    def test(self):
        print 'Starting analysis'

        for trie_name in self.trie_files:
            print 'Starting', trie_name
            correct_number = 0
            all_number = 0
            s = Stemmer(self.plp, filename=trie_name, word_type=None)
            corrects_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/success_' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/' + trie_name.replace('bak', 'txt'), 'w', 'utf-8')
            result_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            corrects_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
            # for k, v in self.cities.iteritems():
            cities = codecs.open('../data/cities_wies_miasto_kolonia_osada.csv', 'r', 'utf-8')
            for city in cities:
                k = city.split(';')[1].strip()
                v = city.split(';')[0].strip()
                all_number += 1
                basic_form = ''
                # word_labels = []
                # if k.__contains__('-'):
                #     for city_parts in v.split('-'):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + '-'
                #         word_labels.append(b.word_labels)
                #     basic_form = basic_form[0:basic_form.__len__() - 1]
                # else:
                #     for city_parts in v.split(' '):
                #         b = s.find_basic_form(city_parts)
                #         basic_form += b.basic_form + ' '
                #         word_labels.append(b.word_labels)

                basic_form = s.find_basic_form(v).basic_form.strip()
                if basic_form != k:
                # if basic_form == k:
                    result_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for w_label in word_labels:
                    #     result_file.write(self.find_most_label(w_label) + ' ')
                    result_file.write('\n')
                else:
                #     corrects_file.write(v + ';' + k + ';' + basic_form + ';')
                    # for label in s.find_labels(word_labels):
                    #     corrects_file.write(label + ' ')
                    # corrects_file.write('\n')
                    correct_number += 1
            result_file.write(u'Liczba miejscowości;Liczba niepoprawnie rozpoznanych;Liczba poprawnie rozpoznanych\n')
            result_file.write(
                str(all_number) + ';' + str(all_number - correct_number) + ';' + str(correct_number))
            print 'Done', trie_name
Example #19
 def __init__(self, stemming=None):
     """
     constructor for this class
     :param stemming:
     """
     self.stop_words = stopwords.words('english')
     self.stemmer = None
     if stemming:
         self.stemmer = Stemmer()
     self.corona_list = [
         "SARS", "sars", "Severe Acute Respiratory Syndrome",
         "severe acute respiratory syndrome", "SARS-CoV", "SARS CoV",
         "sars-cov", "sars cov", "coronavirus", "corona virus", "COVID",
         "covid", "Covid", "COVID-19", "covid-19", "#coronavirus",
         "COVID__19", "#COVID", "#COVID-19", "#covid19", "#SARS"
     ]
Example #20
class SearchEngine:
    _dictionary: Dictionary
    _tokenizer: Tokenizer
    _stemmer: Stemmer
    _query_result: QueryResult

    def __init__(self):
        self._dictionary = Dictionary(load=True)
        self._tokenizer = Tokenizer()
        self._stemmer = Stemmer()
        self._query_result = QueryResult()
        print(self._dictionary)

    def _search_for_token(self, token: Token):
        pl = self._dictionary.getPostingList(token.getWord())
        print(pl)
        if pl is not None:
            self._query_result.addToResults(token, pl)

    def listen(self):
        inp = input("Enter Your Query: ")
        # inp = "هفته"
        query_tokens = self._tokenizer.tokenizeDoc(inp)
        normalized_query_tokens = self._stemmer.normalize_list(query_tokens)
        for p in normalized_query_tokens:
            self._search_for_token(p)
        self._query_result.buildCandidates()
        self._query_result.printKBestCandidates()
Example #21
class Indexer:
    def __init__(self, docs_dir, docs_size):
        self.docLoader = DocLoader(docs_dir, docs_size)
        self.tokenizer = Tokenizer()
        self.stemmer = Stemmer()
        self.dictionary = Dictionary(load=False)
        self._clean()
        self._setup(docs_size)

    def _setup(self, docs_size):
        for doc_id in range(1, docs_size + 1):
            doc = self.docLoader.getDoc(doc_id)
            tokens = self.tokenizer.tokenizeDoc(doc)
            print("tokens: ")
            for token in tokens:
                print(token)
            normalized_words = self.stemmer.normalize_list(tokens)
            print("normalized_words: ")
            for token in normalized_words:
                print(token)
            for token in normalized_words:
                self.dictionary.addToken(token, doc_id)

    @staticmethod
    def _clean():
        if os.path.exists("./dist"):
            try:
                shutil.rmtree("./dist")
            except (FileNotFoundError, FileExistsError) as e:
                print("error")
Example #22
    def __init__(self, stemming=0):
        """
         This function initializes the fields of Parse, initializes the stemmer and adds the stop words
         :param stemming: boolean value indicating whether stemming is needed (optional)
         """
        self.stemming = stemming
        self.stemmer = Stemmer()

        # self.stop_words = frozenset(stopwords.words('english')) ??????????????????????????????????????????????????????
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            ':', '\'s', '.', ',', ';', '’', '?', '!', 'rt', '-', '|', '~', '(',
            ')', '*', '+', '=',
            '/', '"', '``', '\'\'', '\n', '\n\n', '&', 'amp', '…', '\'', '`',
            '[', ']', '{', '}'
        ])
Example #23
    def test_stemmer_flex(self): 

        line = "мамочка свари суп"

        fact = list(Stemmer().stem_flex(Token(0, 8, "мамочка свари суп", "a")))
        check = [Token(0, 8, line, 'a'), Token(0, 7, line, 'a')]

        self.assertEqual(fact, check)
Example #24
    def __init__(self, rootPath="", inputFolder=""):
        self.metadata = Metadata()

        self.stopper = Stopper()
        stopwords_folder = os.path.join(rootPath, "stopwords")
        print("Preprocessor root path: ", rootPath)
        self.stopper.load_stopwords(stopwords_folder)

        self.normalizer_tokenizer = NormalizationTokenization()
        self.stemmer = Stemmer()

        self.p1_path = ""
        self.p2_path = ""
        self.p3_path = ""

        self.rootPath = rootPath
        self.inputFolder = inputFolder
Example #25
    def __init__(self, config):
        self.with_stem = config.get_toStem()
        self.stemmer = Stemmer()
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(['RT'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        # for avg
        self.total_len_docs = 0
        self.number_of_documents = 0

        self.url_removal_pattern = re.compile(r'(https?://[^\s]+)')
        # TODO - fix numbers pattern
        self.numbers_pattern = re.compile(r'^\d+([/|.|,]?\d+)*')
        self.dates_pattern = re.compile(
            r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
        )
Example #26
 def __init__(self, stemming=False):
     self.stemming = stemming
     self.toStem = Stemmer()
     self.terms_dic_to_document = {}
     #self.lower_set = set()
     #self.upper_set = set()
     self.numberList = {
         "thousand": 'K',
         "million": 'M',
         "billion": 'B',
         "percentage": '%',
         "percent": '%',
         "dollar": '$'
     }
     self.stop_words = stopwords.words('english')
     # contains all stop words, bucketed by their first letter
     self.dict_stop_words = {chr(c): [] for c in range(ord('a'), ord('z') + 1)}
     # build the dict of stop words
     for w in self.stop_words:
         self.dict_stop_words[w[0]].append(w)
     # all operators we don't want, plus all parenthesis and separator characters
     self.skip_list = {
         ',', ';', ':', ' ', '\n', '(', ')', '[', ']', '{', '}', '*', '+',
         '-', '/', '<', '>', '&', '=', '|', '~', '"'
     }
     # all weird symbols
     self.wird_symbols = {
         '!', '#', '$', '%', '&', '(', ')', ',', '*', '+', '-', '.', '/',
         ':', ';', '<', '=', '>', '?', '@', '[', "'\'", ']', '^', '`', '{',
         '|', '}', '~', '}'
     }
Example #27
File: main.py Project: Attil/WEDT
def main(args):
    dl = DataLoader()
    stem = Stemmer('porter')

    # files is a list of dicts, one per input file, mapping each key to its stemmed value
    files = [{element[0]: stem.stem(element[1]) for element in dl.load_data(file) if stem.stem(element[1])} for file in args]

    for file, arg in zip(files, args):
        print('Processing file {}...'.format(arg))
        file = {k: list(v) for k, v in file.items()}

        print('Data Clusterer')
        test_clusterer(DataClusterer(list(file.values()), 'euclidean'), file)

        print('-'*64)

        print('Description Clusterer')
        test_clusterer(DescriptionClusterer(list(file.values()), 'cosine'), file)
Example #28
    def __init__(self, stemming):
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            'rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m',
            '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re', r' ', r'', r"",
            r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
            r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{',
            '}',
            "'&'", '.', r'\'d', '-', '--', 'mask', 'pandemic', 'people',
            'wear', 'trump', 'masks', 'new', 'virus', 'wearing', 'cases',
            'amp', 'us', 'like'
        ])
        # , 'covid', '19', 'covid-19', 'mask', 'coronavirus', 'pandemic', 'people', 'wear', 'trump', 'covid19', 'masks', 'new', 'virus', 'wearing', 'cases', 'amp', '#covid19', 'us', 'like'
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        self.text_tokens = None

        self.stemmer = None
        if stemming:
            self.stemmer = Stemmer()
Example #29
    def __init__(self, corpus=None, cxp=True, swr=True, nr=True, stem=True):
        if corpus is not None:
            self.corpus_path = Path(str(corpus))
        else:
            self.corpus_path = None

        self.contraction_expansion_flag = False
        self.stop_word_flag = False
        self.noise_removal_flag = False
        self.stemmer_flag = False

        if cxp:
            self.contraction_expansion_flag = True
            self.contraction_expander = ContractionExpander()
        if swr:
            self.stop_word_flag = True
            self.stop_word_remover = StopWordRemover()
        if nr:
            self.noise_removal_flag = True
            self.noise_remover = NoiseRemover()
        if stem:
            self.stemmer_flag = True
            self.stemmer = Stemmer()
Example #30
    def predict(self, doc):
        # Prepare document
        doc = self.clean(doc)

        # Getting class with highly score
        score = []

        for cat in self.C:
            probability = math.log10(self.DC[cat] / self.D)

            for word in doc.split():
                if len(word) > 2:
                    cur_word = Stemmer.stem(u'{}'.format(word))
                    probability += math.log10(
                        (self.WiC[cat].get(cur_word, 0) + 1) /
                        (len(self.W) + self.WC[cat]))

            score.append(probability)

        return self.C[score.index(max(score))]
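The loop in predict() accumulates a multinomial Naive Bayes log-score with add-one (Laplace) smoothing: log10 of the class prior DC[cat]/D plus, for each word, log10((WiC[cat].get(word, 0) + 1) / (len(W) + WC[cat])). A self-contained sketch of that scoring rule (parameter names here are illustrative, not from the example):

import math

def nb_score(doc_words, class_docs, total_docs, word_counts, vocab_size, class_word_total):
    # log prior of the class plus the smoothed log likelihood of each word
    score = math.log10(class_docs / total_docs)
    for w in doc_words:
        score += math.log10((word_counts.get(w, 0) + 1) /
                            (vocab_size + class_word_total))
    return score

# toy numbers, purely for illustration
print(nb_score(["virus", "mask"], class_docs=3, total_docs=10,
               word_counts={"virus": 5}, vocab_size=100, class_word_total=40))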
Example #31
    def train(self, doc, category):
        # Prepare document
        doc = self.clean(doc)

        # Update classifier:
        # Update D
        self.D += 1

        # Update C & DC
        if category not in self.C:
            self.C.append(category)
            self.DC[category] = 1
        else:
            self.DC[category] += 1

        for word in doc.split():
            if len(word) > 2:
                # 'Normalize' word
                cur_word = Stemmer.stem(u'{}'.format(word))

                # Update W
                if cur_word not in self.W:
                    self.W.append(cur_word)

                # Update WC
                if category not in self.WC.keys():
                    self.WC[category] = 1
                else:
                    self.WC[category] += 1

                # Update Wic
                if category not in self.WiC.keys():
                    self.WiC[category] = {}
                if cur_word not in self.WiC[category].keys():
                    self.WiC[category][cur_word] = 1
                else:
                    self.WiC[category][cur_word] += 1
Example #32
def search_and_rank_query(queries, inverted_index, k, lda):
    #print("start:", datetime.now())

    # config = ConfigClass()
    indexer = Indexer(config)
    # indexer = Indexer(config)
    to_stem = config.get__toStem()
    # to_stem = config.get__toStem()
    queries_list = []
    if type(queries) is list:  # if queries is a list
        for query in queries:
            queries_list.append(query)
    if type(queries) is str:  # if queries is a text file
        with open(queries, encoding='utf-8') as f:
            for line in f:
                if line != "\n":
                    queries_list.append(line)
    all_results = []
    query_num = 1
    tweet_id_num = 1
    for query in queries_list:
        p = Parse(config)
        # parse LDA query
        tokenized_query = p.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [
            w for w in original_query_list if w not in stop_words
        ]
        # find long terms and upper case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                    if word.find(".") != -1:
                        word = word[:-1]
                if not to_stem:
                    tokenized_query.append(word)
                else:
                    stem_word = Stemmer().stem_term(word)
                    tokenized_query.append(stem_word)
            elif len(word) > 1 and re.search(
                    '[a-zA-Z]',
                    word) and word[0].isupper():  # upper first char
                term = word
                if original_query_list.index(word) + 1 < len(
                        original_query_list):
                    index = original_query_list.index(word) + 1
                    while index < len(original_query_list):  # find all term
                        if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]', original_query_list[index]) and \
                                original_query_list[index][0].isupper():
                            new_word2 = original_query_list[index][
                                0] + original_query_list[index][1:].lower(
                                )  # Donald Trump
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                    if len_term > 1:
                        tokenized_query.append(term)
            counter += len_term
        #print(tokenized_query)
        # WordNet query
        wn = WordNet_ranker(tokenized_query)
        WordNet_query = wn.extend_query()
        #print("WordNet_query", WordNet_query)
        searcher = Searcher(inverted_index)
        #print("inverted_index", len(inverted_index))
        # find relevant_docs
        relevant_docs = searcher.relevant_docs_from_posting(WordNet_query)
        #print("relevant", len(relevant_docs))
        # find LDA relevant
        cosine_dict = lda.prob(tokenized_query)
        #print("cosine dict", len(cosine_dict))

        dict_of_cosine_tweets = {}
        #list out keys and values separately
        key_list = list(indexer.tweet_line_dict.keys())
        val_list = list(indexer.tweet_line_dict.values())
        for index in cosine_dict.keys():  # find the tweet id
            dict_of_cosine_tweets[key_list[val_list.index(
                index)]] = cosine_dict[index]
        #print("finish_topic relevant", len(dict_of_cosine_tweets))

        final_dict = {}
        for tweet_id in dict_of_cosine_tweets.keys():
            if k > len(final_dict):
                if tweet_id in relevant_docs:
                    final_dict[tweet_id] = 0
                    final_dict[tweet_id] += (relevant_docs[tweet_id] +
                                             dict_of_cosine_tweets[tweet_id])

        sorted_cosine_tweets = {
            k: v
            for k, v in sorted(
                final_dict.items(), key=lambda item: item[1], reverse=True)
        }
        final_tweets = list(sorted_cosine_tweets.keys())
        #print("final before add K", len(final_tweets))
        if k > len(final_tweets):
            for key in relevant_docs.keys():
                if key not in final_dict:
                    if k > len(final_tweets):
                        final_tweets.append(key)
                    if k == len(final_tweets):
                        break
        #print("final after K", len(final_tweets))
        #print("relevant", relevant_docs)

        #print("sorted_cosine_tweets", sorted_cosine_tweets)
        """for tweet in relevant_docs.keys():
            if tweet in list_of_cosine_tweets:
                if len(final_tweets) < k:
                    final_tweets.append(tweet)

        if len(final_tweets) < k:
            sorted_cosine_tweets = {k: v for k, v in
                                    sorted(list_of_cosine_tweets.items(), key=lambda item: item[1], reverse=True)}
            for key in sorted_cosine_tweets:
                if k > len(final_tweets) and key not in final_tweets:
                    final_tweets.append(key)
                else:
                    break"""

        # write the results into csv file
        tweet_id_num = 1
        s = ""
        with open('results.csv', 'a', encoding='utf-8') as fp:
            for p in final_tweets:
                s = ("Tweet id: " + "{" + p + "}" + " Score: " + "{" +
                     str(tweet_id_num) + "}" + "\n")
                tweet_id_num += 1
                fp.write(s)
        query_num += 1
        all_results.append(final_tweets)
    #print("end:", datetime.now())

    # return top K of final_tweets
    return all_results
Example #33
#query = pattern.getPhoneticCode()
#document = searchEntry5.getPhoneticCode()

#print query
#print document
#print " "
#print pattern.data.comparePhoneticCodeLists(query, document)

#varList = ["halten", "hielt", "gehalt", "haltbar"]
#so = Stemmer("")
#print so.successorVariety ("gehalten", varList)

#varObject = Phonetics("")
#sv = varObject.calcSuccVarietyList(varList)
#print sv
#svm = varObject.calcSuccVarietyMerge(sv)
#print svm
#print varObject.calcSuccVarietyCount(svm)

#text = Advas(["die Kinder freuen sich über die Kastanien"], "")
#keywordList = ["die", "der", "das", "sich"]
#print text.isLanguageByKeywords (keywordList)
#text = Advas(["Schule"], "")
#print text.getSynonyms("/home/frank/projekte/openthesaurus/openthesaurus.txt", "")
#print text.isSynonymOf("Bildungszentrum", "/home/frank/projekte/openthesaurus/openthesaurus.txt", "")

# -- ngram stemmer
stemmerObject = Stemmer("")
print stemmerObject.ngramStemmer(
    ["halten", "hielt", "halter", "halt", "gehalten"], 2, 0.4)
Example #34
from stopwordsremover import StopWordsRemover
from texthandler import TextHandler
from stemmer import Stemmer
from tfidf import TFIDFHandler
from searchhandler import SearchHandler

# Text to be converted
text = """
The 2019–20 coronavirus pandemic is an ongoing pandemic of coronavirus disease 2019 (COVID-19), caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The outbreak was first noted in Wuhan, Hubei province, China, in December 2019. The World Health Organization (WHO) declared the outbreak to be a Public Health Emergency of International Concern on 30 January 2020 and recognized it as a pandemic on 11 March 2020. As of 6 April 2020, more than 1,270,000 cases of COVID-19 have been reported in over 200 countries and territories, resulting in approximately 69,400 deaths. More than 260,000 people have recovered.
"""
# Remove stopwords or unnecessary words such as "is", "a", "and"
removed_stopwords_text = StopWordsRemover.remove(text)
# Stem to reduce inflected words to their word stem, base or root form
stemmed_text = Stemmer.stem(removed_stopwords_text)
# Count how many times each word appears in the document
sanitized_text = TextHandler.WordCounter(stemmed_text)

book1 = {
    "ID": '1',
    "Title": "Covid",
    "Subtitle": "viruses",
    "Author": "author 1",
    "RawText": text,
    "SanitizedText": sanitized_text,
    "RemovedStopWordsText": removed_stopwords_text,
    "TotalNoOfTerms": len(text.lower().split(" ")),
    "TFIDF": 0,
}

text2 = """
Artificial neural networks (ANN) or connectionist systems are computing systems vaguely inspired by the biological neural networks that constitute animal brains. Such systems "learn" to perform tasks by considering examples, generally without being programmed with task-specific rules. For example, in image recognition, they might learn to identify images that contain cats by analyzing example images that have been manually labeled as "cat" or "no cat" and using the results to identify cats in other images. They do this without any prior knowledge of cats, for example, that they have fur, tails, whiskers and cat-like faces. Instead, they automatically generate identifying characteristics from the examples that they process.
    "duree",
    "ville",
    "lieu",
    "labo",
]

outdir = "archives_SFBI_AnnotationManuelle"

mails = list(mailLoaderGen())
words = Counter()
for mail in mails:
    mail.sents = list(iterTokenizedSentences(mail.description))
    for sent in mail.sents:
        words.update(sent)

stemmer = Stemmer(set(word for (word, n) in words.items() if n > 10))

for m in mails:
    outf = outdir + m.mailfile.strip("archives_SFBI")
    d = m.__dict__
    d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y")

    with open(outf, "wt") as f:
        d["from"] = d.pop("sender")
        if m.sfbi:
            ce = d["contact-email"]
            ce = "\t".join(ce) if type(ce) is set else ce
            d["contact-email"] = ce.replace(" [dot] ", ".").replace("[at]", "@")

            cn = d["contact-nom"]
            d["contact-nom"] = "\t".join(cn) if type(cn) is set else cn
Example #36
    def parse_sentence(self, text, tweet_id):
        """
        This function tokenizes, removes stop words and applies lower case to every word within the text
        :param text:
        :return:
        """
        # print(text)
        text_tokens = word_tokenize(text)
        if text_tokens[0] == 'RT':
            return []

        # find TAGS
        if "@" in text_tokens:
            index_list1 = [n for n, x in enumerate(text_tokens) if x == '@']
            counter = 0
            for index in index_list1:
                if index + 1 < len(text_tokens):
                    if text_tokens[index + 1] != '@':
                        new_term = text_tokens[index] + text_tokens[index + 1]
                        text_tokens.append(new_term)
                        counter += 1
            for sign in range(
                    counter
            ):  # deletes all '@' and the word after it from list
                rmv_index = text_tokens.index('@')
                if rmv_index + 1 < len(text_tokens):
                    if text_tokens[rmv_index + 1] != '@':
                        del text_tokens[rmv_index + 1]
                    else:
                        del text_tokens[rmv_index + 1]
                        del text_tokens[rmv_index + 1]
                text_tokens.remove('@')
##############################################################################################
# find PERCENTAGES
        if "%" or "percent" or "Percent" or "percentage" or "Percentage" in text_tokens:
            index_list2 = [
                n for n, x in enumerate(text_tokens)
                if x == '%' or x == 'percent' or x == "percentage"
                or x == 'Percent' or x == "Percentage"
            ]
            counter2 = 0
            for index in index_list2:
                if index - 1 >= 0:
                    if not re.search('[a-zA-Z]', text_tokens[index - 1]):
                        new_term = text_tokens[index - 1] + '%'
                        text_tokens.append(new_term)
                    if text_tokens[index] == '%':
                        counter2 += 1
            while counter2 > 0:  # deletes all '%' and the word after it from list
                rmv_index = text_tokens.index('%')
                if rmv_index + 1 < len(text_tokens) and text_tokens[
                        rmv_index + 1] == '%':  #if %%
                    del text_tokens[rmv_index + 1]
                    counter2 -= 1
                if rmv_index - 1 >= 0 and not re.search(
                        '[a-zA-Z]', text_tokens[rmv_index - 1]):  #is number
                    del text_tokens[rmv_index]
                    del text_tokens[rmv_index - 1]
                counter2 -= 1
##############################################################################################
# finding terms, entities and capital letter
        self.parse_term(text_tokens, tweet_id)
        ##############################################################################################
        # find NUMBERS
        numbers = []
        for item in text_tokens:  #([0-9]+[,.]+[0-9]+)  item.isnumeric() or item.isdigit() or item.isdecimal() or
            if re.findall("^\d+$|^[0-9]{1,3}([,.\/][0-9]{1,3}){0,6}$",
                          item) and not re.search(
                              '[a-zA-Z]',
                              item):  #^\d+$|^[0-9]{1,3}([,.][0-9]{1,3})?$
                if item.find('-') == -1 and item.find('€') == -1 and item.find(
                        '£') == -1 and item.find('%') == -1 and item.find(
                            '¢') == -1 and item.find('~') == -1 and item.find(
                                '+') == -1 and item.find(
                                    '/') <= 1 and item.find("'") == -1:
                    if item.find(',') == -1:
                        numbers.append(item)
                    elif item.find(',') != -1 and re.findall(
                            "^([0-9]{1,3})(,[0-9]{3})*$", item):
                        numbers.append(item)
        # if len(numbers) >0:
        #     print(numbers)
        fractions_list = []
        for num in numbers:
            occur = num.count('.')
            if occur < 2:  # not a date
                rmv_index = text_tokens.index(num)
                to_append = True
                no_text = True
                found_fractions = False
                if text_tokens[rmv_index].find(
                        "/") != -1 and rmv_index - 1 > 0 and text_tokens[
                            rmv_index - 1].isnumeric():  # if found_fractions
                    all_fractions = text_tokens[
                        rmv_index - 1] + " " + text_tokens[rmv_index]
                    fractions_list.append(all_fractions)
                    found_fractions = True
                    to_append = True
                if rmv_index + 1 < len(text_tokens):  # yes text
                    if text_tokens[rmv_index + 1] == "million" or text_tokens[rmv_index + 1] == "Million" or \
                            text_tokens[rmv_index + 1] == "M" or text_tokens[rmv_index + 1] == "m" or text_tokens[rmv_index + 1] == "MILLION":
                        if len(num) < 6:
                            fixed_num = re.sub("[^\d\.]", "",
                                               num)  # remove comma
                            new_num = self.parse_numbers(
                                str(float(fixed_num) * 1000000))
                        else:
                            new_num = self.parse_numbers(num)
                        no_text = False
                        text_tokens[rmv_index + 1] = " "  # remove from list
                        text_tokens[rmv_index] = " "
                    if text_tokens[rmv_index + 1] == "billion" or text_tokens[rmv_index + 1] == "Billion" or \
                            text_tokens[rmv_index + 1] == "B" or text_tokens[rmv_index + 1] == "b" or text_tokens[rmv_index + 1] == "BILLION":
                        if len(num) < 9:
                            fixed_num = re.sub("[^\d\.]", "",
                                               num)  # remove comma
                            new_num = self.parse_numbers(
                                str(float(fixed_num) * 1000000000))
                        else:
                            new_num = self.parse_numbers(num)
                        no_text = False
                        text_tokens[rmv_index + 1] = " "  # remove from list
                        text_tokens[rmv_index] = " "
                    if text_tokens[rmv_index + 1] == "thousand" or text_tokens[rmv_index + 1] == "Thousand" or \
                            text_tokens[rmv_index + 1] == "K" or text_tokens[rmv_index + 1] == "k" or text_tokens[rmv_index + 1] == "THOUSAND":
                        if len(num) < 4:
                            fixed_num = re.sub("[^\d\.]", "",
                                               num)  # remove comma
                            new_num = self.parse_numbers(
                                str(float(fixed_num) * 1000))
                        else:
                            new_num = self.parse_numbers(num)
                        no_text = False
                        text_tokens[rmv_index + 1] = " "  # remove from list
                        text_tokens[rmv_index] = " "
                    if not no_text:
                        text_tokens[rmv_index + 1]  # TODO:?????????????????
                if rmv_index - 1 >= 0 and text_tokens[rmv_index -
                                                      1] == '$':  # yes $
                    if no_text:
                        if len(num) > 3:
                            text_tokens.append("$" + self.parse_numbers(num))
                        else:
                            text_tokens.append("$" + num)
                        text_tokens[rmv_index] = " "  # remove $ from list
                        text_tokens[rmv_index - 1] = " "
                    else:
                        text_tokens.append("$" + new_num)
                        text_tokens[rmv_index - 1] = " "  # remove $ from list
                    to_append = False
                if to_append:  # no $
                    if no_text:
                        if len(num) > 3:
                            text_tokens.append(self.parse_numbers(num))
                            text_tokens[
                                rmv_index] = " "  # remove num from list
                    else:
                        text_tokens.append(new_num)
                if found_fractions:  # delete fractions
                    del text_tokens[rmv_index]
                    del text_tokens[rmv_index - 1]
        """punctuations = '''!(-+—[]{};:'",)<>,./?^&*_’~|=→"”“'''  # removes relevant punctuations and http and //short url
        index_count = 0
        for word in text_tokens:
            to_delete = False
            if len(word) > 1 and word.find('-') != -1:  # contains '-'
                text_tokens.extend(word.split('-'))
                text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('…') != -1:  # contains '…'
                if to_delete == False:
                    text_tokens.extend(word.split('…'))
                    text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('_') != -1:  # contains '_'
                if to_delete == False:
                    text_tokens.extend(word.split('_'))
                    text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('+') != -1:  # contains '+'
                if to_delete == False:
                    text_tokens.extend(word.split('+'))
                    text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('/') != -1 and not (word[0] == '/' and word[1] == '/'):  # contains '/'
                if to_delete == False:
                    text_tokens.extend(word.split('/'))
                    text_tokens.remove(word)
                to_delete = True
            if to_delete == False:
                if word in punctuations:
                    i = text_tokens.index(word)
                    text_tokens[i] = " "
                elif word == "http" or word == "https" or word == "http..." or word == "https..." or word == "RT" or word == "rt":
                    i2 = text_tokens.index(word)
                    text_tokens[i2] = " "
                elif len(word) > 1 and word[0] == '/' and word[1] == '/':
                    i3 = text_tokens.index(word)
                    text_tokens[i3] = " "
                else:
                    text_tokens[index_count] = ''.join([i if ord(i) < 128 else '' for i in word])
            index_count += 1
        text_tokens[:] = [x for x in text_tokens if
                          x != " " and x != ".." and x != "..." and x != "...." and x != "....." and x != "......" and
                          x != "``" and x != "''" and x != "'s" and x != "'m" and x != "n't" and x != "." and x != ""
                          and x != "'re" and x != "__" and x != "_" and x != "___" and x != "," and x != "!"]"""
        ##############################################################################################
        # find punctuations
        new_words = []
        regex_pattern_for_num = '.*\d\.\d.*'
        regex_pattern_for_punctuation = 't.co.*|\'m|\'s|n\'t|\'re|\(|\)|\!|\-|\+|\[|\]|\{|\}|\;|\:|\'|\,|\<|\>|\?|\"|\^|\&|\*|\_|\~|\`|\||\=|\→|\/|\”|\“|\’|\—|\.|\``|\\\\|http.*|https.*|^RT$|^rt$'

        for word in text_tokens:
            # if term is a number in form ...d.d.. exp 230.3K - add to list
            if re.match(regex_pattern_for_num, word):
                new_words.append(word)
                continue
            # else - remove all punctuation from the term
            else:
                word = re.sub(regex_pattern_for_punctuation,
                              '',
                              word,
                              flags=re.IGNORECASE)
                word = ''.join([i if ord(i) < 128 else '' for i in word])
                if word == '' or word == ' ':
                    continue

            new_words.append(word)
        text_tokens = new_words
        ##############################################################################################
        # find HASHTAGS
        # TODO: #whereIsKCR combined
        if "#" in text_tokens:
            index_list3 = [n for n, x in enumerate(text_tokens) if x == '#']
            for index in index_list3:
                if index + 1 < len(text_tokens):
                    if text_tokens[index + 1] != '#' and text_tokens[
                            index +
                            1][0] != '@' and text_tokens[index + 1].find(
                                "#") == -1:  #next word is not # and not @
                        if text_tokens[index +
                                       1].find('_') == -1:  # not contains '_'
                            new_term = text_tokens[index] + text_tokens[index +
                                                                        1]
                            text_tokens.append(new_term)
            for sign in range(
                    len(index_list3
                        )):  # deletes all '#' and the word after it from list
                rmv_index = text_tokens.index('#')
                if rmv_index + 1 < len(text_tokens) and text_tokens[rmv_index + 1] != '#'\
                        and text_tokens[rmv_index + 1][0] != '@' and text_tokens[rmv_index + 1].find("#") == -1:
                    word_val = text_tokens[rmv_index + 1]
                    if not word_val.isupper() and not word_val.islower(
                    ) and word_val.find('_') == -1:  # split uppercase
                        list_of_words = re.findall('[A-Z][^A-Z]*', word_val)
                        for word in list_of_words:
                            text_tokens.append(word)
                    if word_val.find('_') != -1:  # split '_'
                        list_of_words = word_val.split('_')
                        new_word = "#"
                        for word in list_of_words:
                            new_word += word
                            text_tokens.append(word)  # appends each word
                        text_tokens.append(new_word)  # appends #word
                    if text_tokens[rmv_index + 1][0] != '@' and (
                        (not word_val.isupper() and not word_val.islower())
                            or word_val.islower() or
                        (word_val.find('_') != -1)):  #TODO: delete #fuck_you
                        del text_tokens[rmv_index + 1]
                text_tokens.remove('#')
##############################################################################################
# add fractions
        text_tokens.extend(fractions_list)
        ##############################################################################################
        # remove stop_words
        text_tokens_without_stopwords = [
            w.lower() for w in text_tokens if w not in self.stop_words
        ]
        # print(text_tokens)
        # print(text_tokens_without_stopwords)
        ##############################################################################################
        # if stemmer
        to_stem = self.config.get__toStem()
        if to_stem:
            stem_text_tokens_without_stopwords = []
            for token in text_tokens_without_stopwords:
                stem_token = Stemmer().stem_term(token)
                stem_text_tokens_without_stopwords.append(stem_token)
            #print(stem_text_tokens_without_stopwords)
            return stem_text_tokens_without_stopwords

        return text_tokens_without_stopwords
Example #37
import codecs

from plp import PLP
from stemmer import Stemmer

__author__ = 'maciej'

plp = PLP()
plp._init()

ile_poprawnych = 0
ile_wszystkich = 0

s = Stemmer(plp, filename='trie.bak', word_type=None)
f = codecs.open('test.txt', 'r', 'utf-8')

for line in f:
    ile_wszystkich += 1
    parts = line.split(',')
    b_form = s.find_basic_form(parts[0])
    if b_form.basic_form.strip() == parts[1].strip():
        ile_poprawnych += 1
    else:
        print b_form.basic_form, ';', parts[1], ';', parts[0]

print 'Liczba poprawnie rozpoznanych: ', ile_poprawnych, '\nLiczba niepoprawnie rozpoznanych:', ile_wszystkich - ile_poprawnych