Example #1
    def test_stemmer(self):
        line = "мамочка свари суп"
        #tok = Tokenizer().tokenize_alph(line)

        fact = list(Stemmer().stem(Token(0, 7, line, 'a'), 4, line))

        check = [Token(0, 7, line, 'a'), Token(0, 6, line, 'a'),
                 Token(0, 5, line, 'a'), Token(0, 4, line, 'a'), Token(0, 3, line, "a")]

        fact1 = list(Stemmer().stem(Token(14, 17, line, "a"), 4, line))
        check1 = [Token(14, 17, line, "a")]

        self.assertEqual(fact, check)
        self.assertEqual(fact1, check1)
Example #2
 def __init__(self, config=None):
     self.tmp_for_entites = {}
     self.stop_words = stopwords.words('english') + [
         '?', '!', ',', '+', '-', '*', '/', '"', '.', '<', '>', '=', ':',
         '', '{', '{}', '}', '[', ']', '[]', 'are', 'and', 'an', 'at', 'am',
         'a', 'even', 'every', 'everyone', 'rt', 'RT'
     ]
     self.global_dict = {}  #value=number of docs
     self.post_dict = {
     }  # key="word",value=[parquet name,index in parquet,tweet id,frequency in tweet,location in tweet,tf]
     self.entities = {}
     self.path_stop_words = [
         'RT', "rt", 'tweet', 'www', 'http', 'https', 'WWW'
     ]
     self.corona_list = [
         "cov", 'corona', 'coronavirus', 'covid', 'covid19', 'covid 19',
         'corona virus', 'virus corona', 'corona_virus', 'virus_corona',
         "virus"
     ]
     self.config = config
     self.trump = [
         "donald", "donald trump", "trump donald", "president",
         "trump_donald", "donald_trump", "trump-donald", "donald-trump"
     ]
     self.stemmer = None
     if self.config is not None and self.config.toStem:
         self.stemmer = Stemmer()
Example #3
 def __init__(self, docs_dir, docs_size):
     self.docLoader = DocLoader(docs_dir, docs_size)
     self.tokenizer = Tokenizer()
     self.stemmer = Stemmer()
     self.dictionary = Dictionary(load=False)
     self._clean()
     self._setup(docs_size)
Example #4
 def __init__(self, stem):
     self.stop_words = stopwords.words('english')
     self.stop_words.extend([
         'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
         'there', 'about', 'once', 'during', 'out', 'very', 'having',
         'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its',
         'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
         'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the',
         'themselves', 'until', 'below', 'are', 'we', 'these', 'your',
         'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more',
         'himself', 'this', 'down', 'should', 'our', 'their', 'while',
         'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when',
         'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in',
         'will', 'on', 'does', 'yourselves', 'then', 'that', 'because',
         'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he',
         'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
         'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if',
         'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how',
         'further', 'was', 'here', 'than', 'rt', "don't", '-', '&amp;',
         'it’s', 'don’t', 'i’m', "it's", "doesn't", 'https', 't.co',
         'twitter.com', 'weve', 'ur', 'due', 'damn', 'us', 'theyre',
         'would', 'might'
     ])
     self.stop_words_dict = dict.fromkeys(self.stop_words, 0)
     # self.extra_stop_words = {"rt": 0, "https": 0, "t.co": 0, "twitter.com": 0, "weve": 0, "ur": 0, "due": 0, "damn": 0, "us": 0, "theyre": 0, "would": 0, "might": 0}
     # self.stop_words_dict.update(self.extra_stop_words)
     self.term_dict = {}
     self.toStem = stem
     self.text_tokens = []
     if self.toStem:
         self.stemmer = Stemmer()
Example #5
    def __init__(self, stemming):
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(
            ['rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m', '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re',
             r' ', r'', r"", r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
             r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{', '}', "'&'", '.', r'\'d',
             '-', '--'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        self.text_tokens = None

        self.stemmer = None
        if stemming:
            self.stemmer = Stemmer()

        self.hashtag_split_pattern = re.compile(r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')
        self.take_off_non_latin = re.compile(
            pattern=r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]')
        self.left_slash_pattern = re.compile(r'^-?[0-9]+/0*[1-9][0-9]*$')
        self.right_slash_pattern = re.compile(r'^-?[0-9]+\\0*[1-9][0-9]*$')

        self.days_dict = {"Sat": "saturday", "Sun": "sunday", "Mon": "monday", "Tue": "tuesday", "Wed": "wednesday",
                          "Thu": "thursday", "Fri": "friday"}
        self.months_dict = {"Jul": ("july", "07"), "Aug": ("august", "08")}

        self.kbm_shorts = {"k": None, "m": None, "b": None, "K": None, "M": None, "B": None}
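A quick standalone check of the hashtag-splitting pattern above; the regex is copied verbatim from hashtag_split_pattern, and the sample hashtags are made up:

import re

hashtag_split_pattern = re.compile(r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')

print(hashtag_split_pattern.findall('#StayHomeStaySafe'))  # ['Stay', 'Home', 'Stay', 'Safe']
print(hashtag_split_pattern.findall('#COVID19Update'))     # ['COVID19', 'Update']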
Example #6
    def test_stemmer_flex(self): 

        line = "мамочка свари суп"

        fact = list(Stemmer().stem_flex(Token(0, 8, line, "a")))
        check = [Token(0, 8, line, 'a'), Token(0, 7, line, 'a')]

        self.assertEqual(fact, check)
Example #7
 def __init__(self, stemming=False):
     self.stemming = stemming
     self.toStem = Stemmer()
     self.terms_dic_to_document = {}
     #self.lower_set = set()
     #self.upper_set = set()
     self.numberList = {
         "thousand": 'K',
         "million": 'M',
         "billion": 'B',
         "percentage": '%',
         "percent": '%',
         "dollar": '$'
     }
     self.stop_words = stopwords.words('english')
     # buckets of stop words keyed by their first letter
     self.dict_stop_words = {letter: [] for letter in 'abcdefghijklmnopqrstuvwxyz'}
     # build the dict of stop words
     for w in self.stop_words:
         self.dict_stop_words[w[0]].append(w)
     # operators, parentheses and separator characters we want to skip
     self.skip_list = {
         ',', ';', ':', ' ', '\n', '(', ')', '[', ']', '{', '}', '*', '+',
         '-', '/', '<', '>', '&', '=', '|', '~', '"'
     }
     # all weird symbols
     self.wird_symbols = {
         '!', '#', '$', '%', '&', '(', ')', ',', '*', '+', '-', '.', '/',
         ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{',
         '|', '}', '~'
     }
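The per-letter buckets built above turn a stop-word check into a lookup in a short list instead of a scan over the full NLTK list. A minimal sketch of such a check; the helper name and the parser variable are assumptions, not part of the original class:

# Hypothetical helper; `parser` is assumed to be an instance of the class whose __init__ is shown above.
def is_stop_word(parser, token):
    first = token[0].lower()
    return first in parser.dict_stop_words and token.lower() in parser.dict_stop_words[first]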
Example #8
    def add_new_doc(self, document, documents_list_length=10000):
        """
        This function performs the indexing process for a document object.
        The saved information is captured via two dictionaries ('inverted index' and 'posting').
        :param document: a document need to be indexed.
        :return: -
        """

        try:
            document_dictionary = document.term_doc_dictionary
            # self.countDoc += 1
            for term in document_dictionary.keys():
                if self.stemming == 'y':
                    my_stemmer = Stemmer()
                    term = my_stemmer.stem_term(term)
                    # Update inverted index and posting
                if term not in self.inverted_idx.keys():
                    self.inverted_idx[term] = [
                        1, [(document_dictionary[term], document.tweet_id)]
                    ]  # amount of doc, freq in the doc, doc id.

                else:
                    self.inverted_idx[term][0] += 1  # amount of doc
                    self.inverted_idx[term][1].append(
                        (document_dictionary[term],
                         document.tweet_id))  # freq in the doc # doc id

                if term not in self.postingDict.keys():
                    self.postingDict[term] = [(document.tweet_id,
                                               document_dictionary[term])]
                else:
                    self.postingDict[term].append(
                        (document.tweet_id, document_dictionary[term]))
                # self.countTweet -= 1

                if document.tweet_id not in self.tweet_dict.keys():
                    self.tweet_dict[document.tweet_id] = [
                        [term, document_dictionary[term]], 1, 0
                    ]  # [term,freq in tweet], amount of unique terms in tweet, amount of terms in tweet
                elif document_dictionary[term] > self.tweet_dict[
                        document.tweet_id][0][
                            1]:  # tweet exist, compering between freq in two terms
                    if self.tweet_dict[document.tweet_id][0][
                            1] == 1:  # before change term check if the last term is unique
                        self.tweet_dict[document.tweet_id][
                            1] += 1  # last term is unique: add to the amount of uniqe terms in tweet
                    self.tweet_dict[document.tweet_id][0] = [
                        term, document_dictionary[term]
                    ]  # change between the terms
                    self.tweet_dict[document.tweet_id][2] += 1
                elif document_dictionary[
                        term] == 1:  # tweet exist, not most common, check if unique
                    self.tweet_dict[document.tweet_id][1] += 1
                    self.tweet_dict[document.tweet_id][2] += 1
        except:
            # print('problem in indexer : add_new_doc')
            # print(traceback.print_exc())
            pass
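For orientation, a small sketch of the record shapes that add_new_doc builds, following its inline comments; the term, tweet ids and counts below are invented:

# Invented sample values; only the shapes mirror the structures built in add_new_doc.
inverted_idx = {'covid': [2, [(3, '1280000000000000001'), (1, '1280000000000000002')]]}  # [n docs, [(freq in doc, tweet id), ...]]
postingDict = {'covid': [('1280000000000000001', 3), ('1280000000000000002', 1)]}  # [(tweet id, freq in doc), ...]
tweet_dict = {'1280000000000000001': [['covid', 3], 1, 0]}  # [[most frequent term, freq], unique terms, total terms]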
Example #9
 def test_VC_measure(self):
     """Tests the VC measure."""
     stemmer = Stemmer()
     for word, measure in VC_DATA.items():
         self.failUnless(
             stemmer.m(word) == measure,
             "Measure test failed for word '%s' calculated (%d) \
                         should have been (%d)" %
             (word, stemmer.m(word), measure))
Example #10
 def __init__(self, config):
     self.word_dict = {}
     self.stemmer = Stemmer(config.stemming)
     self.stop_words = [
         self.stemmer.stem_term(word) for word in stopwords.words('english')
     ] + ['rt', 't.co', 'https']
     self.rules = config.parser_rules
     self.spell = SpellChecker()
     self.min_length = config.min_length
Example #11
 def __init__(self, with_stemmer=False, include_urls=False, include_quote=False, debug=False, timer=False):
     self.stemmer = Stemmer()
     self.with_stemmer = with_stemmer
     self.include_urls = include_urls
     self.include_quote = include_quote
     self.stop_words = stopwords.words('english')
     self.stop_words += ["i'm", "it's", 'they', "i've", 'you', 'u', 'we', 'rt', 'im', 'use', 'sure', ]
     self.debug = debug
     self.timer = timer
     self.times = []
Example #12
 def __init__(self, stemming):
     self.stop_words = stopwords.words('english')
     self.stop_words += ["rt", "http", "https", "www",
                         "twitter.com"]  # TODO: check &amp
     self.terms = set()
     self.nonstopwords = 0
     self.max_tf = 0
     self.toStem = stemming
     self.entities = {}
     if self.toStem:
         self.stemmer = Stemmer()
Example #13
 def test_stem(self):
     """Checks the final stems."""
     stemmer = Stemmer()
     output = file('output.txt')
     for word in file('voc.txt'):
         word = word.strip()
         stem = output.next().strip()
         self.failUnless(stemmer.stem(word) == stem,
                         "Test failed for word \'%s\' stemmed "\
                         "to %s should have been %s"\
                         % (word, stemmer.stemmed, stem))
Example #14
    def __init__(self, config):
        self.with_stem = config.get_toStem()
        self.stemmer = Stemmer()
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            r' ', r'', r"", r"''", r'""', r'"', r"“", r"”", r"’", r"‘", r"``",
            r"'", r"`", '"'
        ])
        self.stop_words.extend([
            'rt', r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']',
            r'{', '}',
            "'&'", '$', '.', r'\'s', '\'s', '\'d', r'\'d', r'n\'t'
        ])
        self.stop_words.extend(['1️⃣.1️⃣2️⃣'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        # for avg
        self.total_len_docs = 0
        self.number_of_documents = 0

        self.url_pattern = re.compile(r'http\S+')
        self.url_www_pattern = re.compile("[/://?=]")
        # TODO - fix numbers pattern
        self.numbers_pattern = re.compile(r'^\d+([/|.|,]?\d+)*')
        self.non_latin_pattern = re.compile(
            pattern=
            r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]'
        )
        self.dates_pattern = re.compile(
            r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
        )
        # TODO - fix emoji to include all emojis
        self.emojis_pattern = re.compile(
            pattern="["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00010000-\U0010ffff"
            u"\U0001f926-\U0001f937"
            u"\U000024C2-\U0001F251"
            u"\U00002702-\U000027B0"
            u"\u2640-\u2642"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"
            u"\u3030"
            u"\u2600-\u2B55"
            u"\uFE0F\u20E3\uFE0F\u20E3\uFE0F\u20E3"
            "]+",
            flags=re.UNICODE)
Example #15
 def __init__(self, config=None, advanced=False):
     # stopwords_to_add = ['rt']
     self.english_word = words.words()
     self.stop_words = stopwords.words('english')
     puncs_to_add = ['...', '', '\'', '“', '”', '’', '…']
     self.punctuators = [punc for punc in string.punctuation] + puncs_to_add
     self.tt = TweetTokenizer()
     self.stemmer = Stemmer()
     self.need_stemming = config.toStem if isinstance(
         config, ConfigClass) else False
     self.caps_dict = {}
     self.rules_dict = {}
     self.advanced = advanced
Example #16
    def parse_doc(self, doc_as_list):
        """
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list representing the tweet.
        :return: Document object with corresponding fields.
        """

        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        indice = doc_as_list[4]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        retweet_indice = doc_as_list[7]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        quoted_indice = doc_as_list[10]
        retweet_quoted_text = doc_as_list[11]
        retweet_quoted_url = doc_as_list[12]
        retweet_quoted_indice = doc_as_list[13]

        term_dict = {}

        tokenized_text = self.parse_sentence(full_text)
        tokenized_quote = self.parse_sentence(quote_text)
        tokenized_url = self.handle_url(url)

        doc_length = len(
            tokenized_text)  # after text operations - length of full_text

        new_tokenized_text = tokenized_text + tokenized_url + tokenized_quote

        if self.stemming is True:
            s = Stemmer()
            # stem every token in place of its original form
            new_tokenized_text = [s.stem_term(token) for token in new_tokenized_text]

        for term in new_tokenized_text:
            if term is not "":  # or (term.isalpha() and len(term) == 1)
                if term not in term_dict:
                    term_dict[term] = 1
                else:
                    term_dict[term] += 1

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                            retweet_url, quote_text, quote_url, term_dict,
                            doc_length)

        return document
Example #17
    def __init__(self, stemming=0):
        """
         This function initializes the fields of Parse, creates the stemmer and loads the stop words.
         :param stemming: boolean indicating whether stemming is needed (optional)
         """
        self.stemming = stemming
        self.stemmer = Stemmer()

        # self.stop_words = frozenset(stopwords.words('english'))
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            ':', '\'s', '.', ',', ';', '’', '?', '!', 'rt', '-', '|', '~', '(',
            ')', '*', '+', '=',
            '/', '"', '``', '\'\'', '\n', '\n\n', '&', 'amp', '…', '\'', '`',
            '[', ']', '{', '}'
        ])
Example #18
 def __init__(self, stemming=None):
     """
     constructor for this class
     :param stemming: if truthy, a Stemmer is created
     """
     self.stop_words = stopwords.words('english')
     self.stemmer = None
     if stemming:
         self.stemmer = Stemmer()
     self.corona_list = [
         "SARS", "sars", "Severe Acute Respiratory Syndrome",
         "severe acute respiratory syndrome", "SARS-CoV", "SARS CoV",
         "sars-cov", "sars cov", "coronavirus", "corona virus", "COVID",
         "covid", "Covid", "COVID-19", "covid-19", "#coronavirus",
         "COVID__19", "#COVID", "#COVID-19", "#covid19", "#SARS"
     ]
Example #19
    def __init__(self, rootPath="", inputFolder=""):
        self.metadata = Metadata()

        self.stopper = Stopper()
        stopwords_folder = os.path.join(rootPath, "stopwords")
        print("Preprocessor root path: ", rootPath)
        self.stopper.load_stopwords(stopwords_folder)

        self.normalizer_tokenizer = NormalizationTokenization()
        self.stemmer = Stemmer()

        self.p1_path = ""
        self.p2_path = ""
        self.p3_path = ""

        self.rootPath = rootPath
        self.inputFolder = inputFolder
Example #20
    def __init__(self, config):
        self.with_stem = config.get_toStem()
        self.stemmer = Stemmer()
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(['RT'])
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        # for avg
        self.total_len_docs = 0
        self.number_of_documents = 0

        self.url_removal_pattern = re.compile(r'(https?://[^\s]+)')
        # TODO - fix numbers pattern
        self.numbers_pattern = re.compile(r'^\d+([/|.|,]?\d+)*')
        self.dates_pattern = re.compile(
            r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
        )
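A standalone check of what the dd/mm/yyyy dates_pattern above accepts; the regex is copied verbatim, and the sample dates are made up:

import re

dates_pattern = re.compile(
    r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
)

print(bool(dates_pattern.match('29/02/2020')))  # True: 2020 is a leap year
print(bool(dates_pattern.match('29/02/2019')))  # False: 2019 is not
print(bool(dates_pattern.match('31/04/2020')))  # False: April has 30 days
print(bool(dates_pattern.match('07-12-1999')))  # True: the separator just has to be consistent (backreferences)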
Example #21
    def __init__(self, stemming):
        self.stop_words = stopwords.words('english')
        self.stop_words.extend([
            'rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m',
            '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re', r' ', r'', r"",
            r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
            r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{',
            '}',
            "'&'", '.', r'\'d', '-', '--', 'mask', 'pandemic', 'people',
            'wear', 'trump', 'masks', 'new', 'virus', 'wearing', 'cases',
            'amp', 'us', 'like'
        ])
        # , 'covid', '19', 'covid-19', 'mask', 'coronavirus', 'pandemic', 'people', 'wear', 'trump', 'covid19', 'masks', 'new', 'virus', 'wearing', 'cases', 'amp', '#covid19', 'us', 'like'
        self.stop_words_dict = dict.fromkeys(self.stop_words)

        self.text_tokens = None

        self.stemmer = None
        if stemming:
            self.stemmer = Stemmer()
Example #22
    def __init__(self, corpus=None, cxp=True, swr=True, nr=True, stem=True):
        if corpus is not None:
            self.corpus_path = Path(str(corpus))
        else:
            self.corpus_path = None

        self.contraction_expansion_flag = False
        self.stop_word_flag = False
        self.noise_removal_flag = False
        self.stemmer_flag = False

        if cxp:
            self.contraction_expansion_flag = True
            self.contraction_expander = ContractionExpander()
        if swr:
            self.stop_word_flag = True
            self.stop_word_remover = StopWordRemover()
        if nr:
            self.noise_removal_flag = True
            self.noise_remover = NoiseRemover()
        if stem:
            self.stemmer_flag = True
            self.stemmer = Stemmer()
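A minimal construction sketch for the snippet above; the class name Preprocessor and the corpus path are assumptions, and only the flag/attribute behaviour visible in __init__ is exercised:

# Hypothetical usage; Preprocessor is assumed to be the class whose __init__ is shown above.
pre = Preprocessor(corpus='data/corpus.txt', cxp=True, swr=True, nr=False, stem=True)

assert pre.contraction_expansion_flag and pre.stop_word_flag and pre.stemmer_flag
assert pre.noise_removal_flag is False      # nr=False leaves the flag down...
assert not hasattr(pre, 'noise_remover')    # ...and the NoiseRemover helper is never created
print(pre.corpus_path)                      # data/corpus.txt (a pathlib.Path)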
Example #23
def search_and_rank_query(queries, inverted_index, k, lda):
    #print("start:", datetime.now())

    # config = ConfigClass()
    indexer = Indexer(config)
    to_stem = config.get__toStem()
    queries_list = []
    if type(queries) is list:  # if queries is a list
        for query in queries:
            queries_list.append(query)
    if type(queries) is str:  # if queries is a text file
        with open(queries, encoding='utf-8') as f:
            for line in f:
                if line != "\n":
                    queries_list.append(line)
    all_results = []
    query_num = 1
    tweet_id_num = 1
    for query in queries_list:
        p = Parse(config)
        # parse LDA query
        tokenized_query = p.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [
            w for w in original_query_list if w not in stop_words
        ]
        # find long terms and upper case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                    if word.find(".") != -1:
                        word = word[:-1]
                if not to_stem:
                    tokenized_query.append(word)
                else:
                    stem_word = Stemmer().stem_term(word)
                    tokenized_query.append(stem_word)
            elif len(word) > 1 and re.search(
                    '[a-zA-Z]',
                    word) and word[0].isupper():  # upper first char
                term = word
                if original_query_list.index(word) + 1 < len(
                        original_query_list):
                    index = original_query_list.index(word) + 1
                    while index < len(original_query_list):  # find all term
                        if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]', original_query_list[index]) and \
                                original_query_list[index][0].isupper():
                            new_word2 = original_query_list[index][
                                0] + original_query_list[index][1:].lower(
                                )  # Donald Trump
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                    if len_term > 1:
                        tokenized_query.append(term)
            counter += len_term
        #print(tokenized_query)
        # WordNet query
        wn = WordNet_ranker(tokenized_query)
        WordNet_query = wn.extend_query()
        #print("WordNet_query", WordNet_query)
        searcher = Searcher(inverted_index)
        #print("inverted_index", len(inverted_index))
        # find relevant_docs
        relevant_docs = searcher.relevant_docs_from_posting(WordNet_query)
        #print("relevant", len(relevant_docs))
        # find LDA relevant
        cosine_dict = lda.prob(tokenized_query)
        #print("cosine dict", len(cosine_dict))

        dict_of_cosine_tweets = {}
        #list out keys and values separately
        key_list = list(indexer.tweet_line_dict.keys())
        val_list = list(indexer.tweet_line_dict.values())
        for index in cosine_dict.keys():  # find the tweet id
            dict_of_cosine_tweets[key_list[val_list.index(
                index)]] = cosine_dict[index]
        #print("finish_topic relevant", len(dict_of_cosine_tweets))

        final_dict = {}
        for tweet_id in dict_of_cosine_tweets.keys():
            if k > len(final_dict):
                if tweet_id in relevant_docs:
                    final_dict[tweet_id] = 0
                    final_dict[tweet_id] += (relevant_docs[tweet_id] +
                                             dict_of_cosine_tweets[tweet_id])

        sorted_cosine_tweets = {
            k: v
            for k, v in sorted(
                final_dict.items(), key=lambda item: item[1], reverse=True)
        }
        final_tweets = list(sorted_cosine_tweets.keys())
        #print("final before add K", len(final_tweets))
        if k > len(final_tweets):
            for key in relevant_docs.keys():
                if key not in final_dict:
                    if k > len(final_tweets):
                        final_tweets.append(key)
                    if k == len(final_tweets):
                        break
        #print("final after K", len(final_tweets))
        #print("relevant", relevant_docs)

        #print("sorted_cosine_tweets", sorted_cosine_tweets)
        """for tweet in relevant_docs.keys():
            if tweet in list_of_cosine_tweets:
                if len(final_tweets) < k:
                    final_tweets.append(tweet)

        if len(final_tweets) < k:
            sorted_cosine_tweets = {k: v for k, v in
                                    sorted(list_of_cosine_tweets.items(), key=lambda item: item[1], reverse=True)}
            for key in sorted_cosine_tweets:
                if k > len(final_tweets) and key not in final_tweets:
                    final_tweets.append(key)
                else:
                    break"""

        # write the results into csv file
        tweet_id_num = 1
        s = ""
        with open('results.csv', 'a', encoding='utf-8') as fp:
            for p in final_tweets:
                s = ("Tweet id: " + "{" + p + "}" + " Score: " + "{" +
                     str(tweet_id_num) + "}" + "\n")
                tweet_id_num += 1
                fp.write(s)
        query_num += 1
        all_results.append(final_tweets)
    #print("end:", datetime.now())

    # return top K of final_tweets
    return all_results
Example #24
#query = pattern.getPhoneticCode()
#document = searchEntry5.getPhoneticCode()

#print query
#print document
#print " "
#print pattern.data.comparePhoneticCodeLists(query, document)

#varList = ["halten", "hielt", "gehalt", "haltbar"]
#so = Stemmer("")
#print so.successorVariety ("gehalten", varList)

#varObject = Phonetics("")
#sv = varObject.calcSuccVarietyList(varList)
#print sv
#svm = varObject.calcSuccVarietyMerge(sv)
#print svm
#print varObject.calcSuccVarietyCount(svm)

#text = Advas(["die Kinder freuen sich über die Kastanien"], "")
#keywordList = ["die", "der", "das", "sich"]
#print text.isLanguageByKeywords (keywordList)
#text = Advas(["Schule"], "")
#print text.getSynonyms("/home/frank/projekte/openthesaurus/openthesaurus.txt", "")
#print text.isSynonymOf("Bildungszentrum", "/home/frank/projekte/openthesaurus/openthesaurus.txt", "")

# -- ngram stemmer
stemmerObject = Stemmer("")
print stemmerObject.ngramStemmer(
    ["halten", "hielt", "halter", "halt", "gehalten"], 2, 0.4)
Example #25
    def parse_sentence(self, text, tweet_id):
        """
        This function tokenizes, removes stop words and applies lower case to every word within the text
        :param text:
        :return:
        """
        # print(text)
        text_tokens = word_tokenize(text)
        if text_tokens[0] == 'RT':
            return []

        # find TAGS
        if "@" in text_tokens:
            index_list1 = [n for n, x in enumerate(text_tokens) if x == '@']
            counter = 0
            for index in index_list1:
                if index + 1 < len(text_tokens):
                    if text_tokens[index + 1] != '@':
                        new_term = text_tokens[index] + text_tokens[index + 1]
                        text_tokens.append(new_term)
                        counter += 1
            for sign in range(
                    counter
            ):  # deletes all '@' and the word after it from list
                rmv_index = text_tokens.index('@')
                if rmv_index + 1 < len(text_tokens):
                    if text_tokens[rmv_index + 1] != '@':
                        del text_tokens[rmv_index + 1]
                    else:
                        del text_tokens[rmv_index + 1]
                        del text_tokens[rmv_index + 1]
                text_tokens.remove('@')
        ##############################################################################################
        # find PERCENTAGES
        if any(x in text_tokens for x in ('%', 'percent', 'Percent', 'percentage', 'Percentage')):
            index_list2 = [
                n for n, x in enumerate(text_tokens)
                if x == '%' or x == 'percent' or x == "percentage"
                or x == 'Percent' or x == "Percentage"
            ]
            counter2 = 0
            for index in index_list2:
                if index - 1 >= 0:
                    if not re.search('[a-zA-Z]', text_tokens[index - 1]):
                        new_term = text_tokens[index - 1] + '%'
                        text_tokens.append(new_term)
                    if text_tokens[index] == '%':
                        counter2 += 1
            while counter2 > 0:  # deletes all '%' and the word after it from list
                rmv_index = text_tokens.index('%')
                if rmv_index + 1 < len(text_tokens) and text_tokens[
                        rmv_index + 1] == '%':  #if %%
                    del text_tokens[rmv_index + 1]
                    counter2 -= 1
                if rmv_index - 1 >= 0 and not re.search(
                        '[a-zA-Z]', text_tokens[rmv_index - 1]):  #is number
                    del text_tokens[rmv_index]
                    del text_tokens[rmv_index - 1]
                counter2 -= 1
        ##############################################################################################
        # finding terms, entities and capital letter
        self.parse_term(text_tokens, tweet_id)
        ##############################################################################################
        # find NUMBERS
        numbers = []
        for item in text_tokens:  #([0-9]+[,.]+[0-9]+)  item.isnumeric() or item.isdigit() or item.isdecimal() or
            if re.findall("^\d+$|^[0-9]{1,3}([,.\/][0-9]{1,3}){0,6}$",
                          item) and not re.search(
                              '[a-zA-Z]',
                              item):  #^\d+$|^[0-9]{1,3}([,.][0-9]{1,3})?$
                if item.find('-') == -1 and item.find('€') == -1 and item.find(
                        '£') == -1 and item.find('%') == -1 and item.find(
                            '¢') == -1 and item.find('~') == -1 and item.find(
                                '+') == -1 and item.find(
                                    '/') <= 1 and item.find("'") == -1:
                    if item.find(',') == -1:
                        numbers.append(item)
                    elif item.find(',') != -1 and re.findall(
                            "^([0-9]{1,3})(,[0-9]{3})*$", item):
                        numbers.append(item)
        # if len(numbers) >0:
        #     print(numbers)
        fractions_list = []
        for num in numbers:
            occur = num.count('.')
            if occur < 2:  # not a date
                rmv_index = text_tokens.index(num)
                to_append = True
                no_text = True
                found_fractions = False
                if text_tokens[rmv_index].find(
                        "/") != -1 and rmv_index - 1 > 0 and text_tokens[
                            rmv_index - 1].isnumeric():  # if found_fractions
                    all_fractions = text_tokens[
                        rmv_index - 1] + " " + text_tokens[rmv_index]
                    fractions_list.append(all_fractions)
                    found_fractions = True
                    to_append = True
                if rmv_index + 1 < len(text_tokens):  # yes text
                    if text_tokens[rmv_index + 1] == "million" or text_tokens[rmv_index + 1] == "Million" or \
                            text_tokens[rmv_index + 1] == "M" or text_tokens[rmv_index + 1] == "m" or text_tokens[rmv_index + 1] == "MILLION":
                        if len(num) < 6:
                            fixed_num = re.sub("[^\d\.]", "",
                                               num)  # remove comma
                            new_num = self.parse_numbers(
                                str(float(fixed_num) * 1000000))
                        else:
                            new_num = self.parse_numbers(num)
                        no_text = False
                        text_tokens[rmv_index + 1] = " "  # remove from list
                        text_tokens[rmv_index] = " "
                    if text_tokens[rmv_index + 1] == "billion" or text_tokens[rmv_index + 1] == "Billion" or \
                            text_tokens[rmv_index + 1] == "B" or text_tokens[rmv_index + 1] == "b" or text_tokens[rmv_index + 1] == "BILLION":
                        if len(num) < 9:
                            fixed_num = re.sub("[^\d\.]", "",
                                               num)  # remove comma
                            new_num = self.parse_numbers(
                                str(float(fixed_num) * 1000000000))
                        else:
                            new_num = self.parse_numbers(num)
                        no_text = False
                        text_tokens[rmv_index + 1] = " "  # remove from list
                        text_tokens[rmv_index] = " "
                    if text_tokens[rmv_index + 1] == "thousand" or text_tokens[rmv_index + 1] == "Thousand" or \
                            text_tokens[rmv_index + 1] == "K" or text_tokens[rmv_index + 1] == "k" or text_tokens[rmv_index + 1] == "THOUSAND":
                        if len(num) < 4:
                            fixed_num = re.sub("[^\d\.]", "",
                                               num)  # remove comma
                            new_num = self.parse_numbers(
                                str(float(fixed_num) * 1000))
                        else:
                            new_num = self.parse_numbers(num)
                        no_text = False
                        text_tokens[rmv_index + 1] = " "  # remove from list
                        text_tokens[rmv_index] = " "
                    if not no_text:
                        text_tokens[rmv_index + 1]  # TODO:?????????????????
                if rmv_index - 1 >= 0 and text_tokens[rmv_index -
                                                      1] == '$':  # yes $
                    if no_text:
                        if len(num) > 3:
                            text_tokens.append("$" + self.parse_numbers(num))
                        else:
                            text_tokens.append("$" + num)
                        text_tokens[rmv_index] = " "  # remove $ from list
                        text_tokens[rmv_index - 1] = " "
                    else:
                        text_tokens.append("$" + new_num)
                        text_tokens[rmv_index - 1] = " "  # remove $ from list
                    to_append = False
                if to_append:  # no $
                    if no_text:
                        if len(num) > 3:
                            text_tokens.append(self.parse_numbers(num))
                            text_tokens[
                                rmv_index] = " "  # remove num from list
                    else:
                        text_tokens.append(new_num)
                if found_fractions:  # delete fractions
                    del text_tokens[rmv_index]
                    del text_tokens[rmv_index - 1]
        """punctuations = '''!(-+—[]{};:'",)<>,./?^&*_’~|=→"”“'''  # removes relevant punctuations and http and //short url
        index_count = 0
        for word in text_tokens:
            to_delete = False
            if len(word) > 1 and word.find('-') != -1:  # contains '-'
                text_tokens.extend(word.split('-'))
                text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('…') != -1:  # contains '…'
                if to_delete == False:
                    text_tokens.extend(word.split('…'))
                    text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('_') != -1:  # contains '_'
                if to_delete == False:
                    text_tokens.extend(word.split('_'))
                    text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('+') != -1:  # contains '+'
                if to_delete == False:
                    text_tokens.extend(word.split('+'))
                    text_tokens.remove(word)
                to_delete = True
            if len(word) > 1 and word.find('/') != -1 and not (word[0] == '/' and word[1] == '/'):  # contains '/'
                if to_delete == False:
                    text_tokens.extend(word.split('/'))
                    text_tokens.remove(word)
                to_delete = True
            if to_delete == False:
                if word in punctuations:
                    i = text_tokens.index(word)
                    text_tokens[i] = " "
                elif word == "http" or word == "https" or word == "http..." or word == "https..." or word == "RT" or word == "rt":
                    i2 = text_tokens.index(word)
                    text_tokens[i2] = " "
                elif len(word) > 1 and word[0] == '/' and word[1] == '/':
                    i3 = text_tokens.index(word)
                    text_tokens[i3] = " "
                else:
                    text_tokens[index_count] = ''.join([i if ord(i) < 128 else '' for i in word])
            index_count += 1
        text_tokens[:] = [x for x in text_tokens if
                          x != " " and x != ".." and x != "..." and x != "...." and x != "....." and x != "......" and
                          x != "``" and x != "''" and x != "'s" and x != "'m" and x != "n't" and x != "." and x != ""
                          and x != "'re" and x != "__" and x != "_" and x != "___" and x != "," and x != "!"]"""
        ##############################################################################################
        # find punctuations
        new_words = []
        regex_pattern_for_num = '.*\d\.\d.*'
        regex_pattern_for_punctuation = 't.co.*|\'m|\'s|n\'t|\'re|\(|\)|\!|\-|\+|\[|\]|\{|\}|\;|\:|\'|\,|\<|\>|\?|\"|\^|\&|\*|\_|\~|\`|\||\=|\→|\/|\”|\“|\’|\—|\.|\``|\\\\|http.*|https.*|^RT$|^rt$'

        for word in text_tokens:
            # if term is a number in form ...d.d.. exp 230.3K - add to list
            if re.match(regex_pattern_for_num, word):
                new_words.append(word)
                continue
            # else - remove all punctuation from the term
            else:
                word = re.sub(regex_pattern_for_punctuation,
                              '',
                              word,
                              flags=re.IGNORECASE)
                word = ''.join([i if ord(i) < 128 else '' for i in word])
                if word == '' or word == ' ':
                    continue

            new_words.append(word)
        text_tokens = new_words
        ##############################################################################################
        # find HASHTAGS
        # TODO: #whereIsKCR combined
        if "#" in text_tokens:
            index_list3 = [n for n, x in enumerate(text_tokens) if x == '#']
            for index in index_list3:
                if index + 1 < len(text_tokens):
                    if text_tokens[index + 1] != '#' and text_tokens[
                            index +
                            1][0] != '@' and text_tokens[index + 1].find(
                                "#") == -1:  #next word is not # and not @
                        if text_tokens[index +
                                       1].find('_') == -1:  # not contains '_'
                            new_term = text_tokens[index] + text_tokens[index +
                                                                        1]
                            text_tokens.append(new_term)
            for sign in range(
                    len(index_list3
                        )):  # deletes all '#' and the word after it from list
                rmv_index = text_tokens.index('#')
                if rmv_index + 1 < len(text_tokens) and text_tokens[rmv_index + 1] != '#'\
                        and text_tokens[rmv_index + 1][0] != '@' and text_tokens[rmv_index + 1].find("#") == -1:
                    word_val = text_tokens[rmv_index + 1]
                    if not word_val.isupper() and not word_val.islower(
                    ) and word_val.find('_') == -1:  # split uppercase
                        list_of_words = re.findall('[A-Z][^A-Z]*', word_val)
                        for word in list_of_words:
                            text_tokens.append(word)
                    if word_val.find('_') != -1:  # split '_'
                        list_of_words = word_val.split('_')
                        new_word = "#"
                        for word in list_of_words:
                            new_word += word
                            text_tokens.append(word)  # appends each word
                        text_tokens.append(new_word)  # appends #word
                    if text_tokens[rmv_index + 1][0] != '@' and (
                        (not word_val.isupper() and not word_val.islower())
                            or word_val.islower() or
                        (word_val.find('_') != -1)):  #TODO: delete #fuck_you
                        del text_tokens[rmv_index + 1]
                text_tokens.remove('#')
        ##############################################################################################
        # add fractions
        text_tokens.extend(fractions_list)
        ##############################################################################################
        # remove stop_words
        text_tokens_without_stopwords = [
            w.lower() for w in text_tokens if w not in self.stop_words
        ]
        # print(text_tokens)
        # print(text_tokens_without_stopwords)
        ##############################################################################################
        # if stemmer
        to_stem = self.config.get__toStem()
        if to_stem:
            stem_text_tokens_without_stopwords = []
            for token in text_tokens_without_stopwords:
                stem_token = Stemmer().stem_term(token)
                stem_text_tokens_without_stopwords.append(stem_token)
            #print(stem_text_tokens_without_stopwords)
            return stem_text_tokens_without_stopwords

        return text_tokens_without_stopwords
Example #26
    def parse_sentence(self, text):
        """
        This function tokenizes, removes stop words and applies lower case to every word within the text
        :param text:
        :return:
        """
        after_parse = []
        # tokenizer:
        stemmer = Stemmer()
        tweet_tokenizer = TweetTokenizer()
        text_tokens = tweet_tokenizer.tokenize(
            re.sub(r'[^\x00-\x7f]', r' ', text))

        symbols = '.,...,:;{}()[]"*?!&$%+-_=></\''
        text_tokens_without_stopwords = [
            w for w in text_tokens
            if w.lower() not in self.stop_words and w not in symbols
        ]

        # separate -
        j = 0
        while j < len(text_tokens_without_stopwords):
            if '-' in text_tokens_without_stopwords[
                    j] and 'http' not in text_tokens_without_stopwords[j]:
                if text_tokens_without_stopwords[j][0] == '-':
                    j += 1
                    continue
                temp = text_tokens_without_stopwords[j].split('-')
                text_tokens_without_stopwords.remove(
                    text_tokens_without_stopwords[j])
                text_tokens_without_stopwords.insert(j, temp[0])
                if temp[1] != '':
                    text_tokens_without_stopwords.insert(j + 1, temp[1])
                j += 1

            j += 1

        i = 0
        covid = ['COVID', 'COVID19', 'CORONAVIRUS', 'CORONA']

        while i < len(text_tokens_without_stopwords):
            # covid rule
            if any(covid_exp in text_tokens_without_stopwords[i].upper()
                   for covid_exp in covid):
                if i < len(text_tokens_without_stopwords) - 1 and (
                        text_tokens_without_stopwords[i + 1] == '19'
                        or text_tokens_without_stopwords[i + 1].upper()
                        == 'VIRUS'):
                    i += 1
                after_parse.append('COVID19')

            # hashtag
            elif text_tokens_without_stopwords[i][0] == '#':
                hashtag = self.parse_hashtags(text_tokens_without_stopwords[i])
                after_parse.extend(hashtag)

            # tagging
            elif text_tokens_without_stopwords[i][0] == '@':
                tag = self.parse_tagging(text_tokens_without_stopwords[i])
                after_parse.append(tag)

            # url
            elif 'http' in text_tokens_without_stopwords[i]:
                url = self.parse_url(text_tokens_without_stopwords[i])
                after_parse.extend(url)

            # percent
            elif (i < len(text_tokens_without_stopwords) - 2 and (text_tokens_without_stopwords[i + 1] == 'percent' or
                                                                  text_tokens_without_stopwords[
                                                                      i + 1] == 'percentage')) or \
                    text_tokens_without_stopwords[i][-1] == '%':
                if not text_tokens_without_stopwords[i][-1] == '%':
                    i += 1
                percentage = self.parse_percentages(
                    text_tokens_without_stopwords[i])
                after_parse.append(percentage)

            # numbers
            elif text_tokens_without_stopwords[i].replace(',', '').replace(
                    '.', '', 1).isdigit():
                if '.' in text_tokens_without_stopwords[i]:
                    curr_num = float(text_tokens_without_stopwords[i].replace(
                        ',', ''))
                else:
                    curr_num = int(text_tokens_without_stopwords[i].replace(
                        ',', ''))

                if i == len(
                        text_tokens_without_stopwords
                ) - 1:  # if this is the last word, send only the current word
                    number = self.parse_numbers(curr_num, '')
                else:
                    number = self.parse_numbers(
                        curr_num, text_tokens_without_stopwords[i + 1])
                    if text_tokens_without_stopwords[i + 1].lower() == 'thousand' or text_tokens_without_stopwords[
                        i + 1].lower() == \
                            'million' or text_tokens_without_stopwords[i + 1].lower() == 'billion':
                        i += 1

                after_parse.append(number)

            elif text_tokens_without_stopwords[i].isupper():
                after_parse.append(text_tokens_without_stopwords[i])

            # names and entities
            elif text_tokens_without_stopwords[i][0].isupper():
                names_and_entities = self.parse_names_and_entities(
                    text_tokens_without_stopwords[i:])
                after_parse.append(names_and_entities[0])
                i += names_and_entities[1] - 1

            else:
                after_parse.append(text_tokens_without_stopwords[i])

            i += 1

        # while '' in after_parse: after_parse.remove('')
        after_parse = [w for w in after_parse if w not in symbols and w != '']

        # after_stem = []
        # for token in after_parse:
        #     after_stem.append(stemmer.stem_term(token))
        # after_parse = after_stem

        return after_parse
Example #27
    else:
        return check_word


def check_case(check_word):
    remove_word_list = [
        "द्वारा", "बाट", "देखि", "लाई", "निम्ति", "मा", "को", "ले", "हरु"
    ]
    return [
        subfunc(check_word, remove_words)
        if len(list(check_word[:-len(remove_words)])) >= 3 else check_word
        for remove_words in remove_word_list
    ][0]


st = Stemmer()
words = [st.stem(case) for case in iwords]

print(words[:100])

file = open(os.path.join(VISUAL_FLD, "vocab_nep.tsv"), "w")
from collections import Counter
dictionary = dict()
count = [('UNK', -1)]
index = 0
count.extend(Counter(words).most_common(VOCAB_SIZE - 1))
for word, _ in count:
    dictionary[word] = index
    index += 1
    file.write(word + '\n')
file.close()
index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
Example #28
 def pos_tag(self, sentence):
     stemmer = Stemmer()
     sent = stemmer.stem(sentence)
     sent = WordTokenizer(sent)
     tags = self.tag(sent)
     return tags
Example #29
 def __init__(self):
     self.stop_words = stopwords.words('english')
     self.dictionary_term_index = {}
     self.array_names_and_entities = {}
     self.porter_stemmer = Stemmer()
Example #30
    def _get_features(self, tokens, idx):
        stemmer = Stemmer()
        numbs = numbers.values()
        puncts = punctuations.values()

        token = stemmer.stem(tokens[idx])
        feature_list = []

        if not token:
            return feature_list

        for number in numbs:
            if number in list(token):
                feature_list.append("HAS_NUM")

        for punctuation in puncts:
            if punctuation in list(token):
                feature_list.append("PUNCTUATION")

        feature_list.append("WORD_" + token)

        if len(token) > 1:
            feature_list.append("SUF_" + token[-1:])
            feature_list.append("PRE_" + token[:1])
        if len(token) > 2:
            feature_list.append("SUF_" + token[-2:])
            feature_list.append("PRE_" + token[:2])
        if len(token) > 3:
            feature_list.append("SUF_" + token[-3:])
            feature_list.append("PRE_" + token[:3])

        if idx >= 1:
            previous_token = stemmer.stem(tokens[idx - 1])
            if not previous_token:
                return feature_list

            for number in numbs:
                if number in list(previous_token):
                    feature_list.append("HAS_NUM")

            for punctuation in puncts:
                if punctuation in list(previous_token):
                    feature_list.append("PUNCTUATION")

            if len(previous_token) > 1:
                feature_list.append("SUF_" + previous_token[-1:])
                feature_list.append("PRE_" + previous_token[:1])
            if len(previous_token) > 2:
                feature_list.append("SUF_" + previous_token[-2:])
                feature_list.append("PRE_" + previous_token[:2])
            if len(previous_token) > 3:
                feature_list.append("SUF_" + previous_token[-3:])
                feature_list.append("PRE_" + previous_token[:3])

            feature_list.append("PREV_WORD_" + previous_token)

        if idx >= 2:
            previous_token = stemmer.stem(tokens[idx - 2])
            if not previous_token:
                return feature_list

            for number in numbs:
                if number in list(previous_token):
                    feature_list.append("HAS_NUM")

            for punctuation in puncts:
                if punctuation in list(previous_token):
                    feature_list.append("PUNCTUATION")

            if len(previous_token) > 1:
                feature_list.append("SUF_" + previous_token[-1:])
                feature_list.append("PRE_" + previous_token[:1])
            if len(previous_token) > 2:
                feature_list.append("SUF_" + previous_token[-2:])
                feature_list.append("PRE_" + previous_token[:2])
            if len(previous_token) > 3:
                feature_list.append("SUF_" + previous_token[-3:])
                feature_list.append("PRE_" + previous_token[:3])

            feature_list.append("PREV_PREV_WORD_" + previous_token)

        if idx < len(tokens) - 1:
            next_token = stemmer.stem(tokens[idx + 1])
            if not next_token:
                return feature_list

            for number in numbs:
                if number in list(next_token):
                    feature_list.append("HAS_NUM")

            for punctuation in puncts:
                if punctuation in list(next_token):
                    feature_list.append("PUNCTUATION")

            if len(next_token) > 1:
                feature_list.append("SUF_" + next_token[-1:])
                feature_list.append("PRE_" + next_token[:1])
            if len(next_token) > 2:
                feature_list.append("SUF_" + next_token[-2:])
                feature_list.append("PRE_" + next_token[:2])
            if len(next_token) > 3:
                feature_list.append("SUF_" + next_token[-3:])
                feature_list.append("PRE_" + next_token[:3])

            feature_list.append("NEXT_WORD_" + next_token)

        if idx < len(tokens) - 2:
            next_token = stemmer.stem(tokens[idx + 2])
            if not next_token:
                return feature_list

            for number in numbs:
                if number in list(next_token):
                    feature_list.append("HAS_NUM")

            for punctuation in puncts:
                if punctuation in list(next_token):
                    feature_list.append("PUNCTUATION")

            if len(next_token) > 1:
                feature_list.append("SUF_" + next_token[-1:])
                feature_list.append("PRE_" + next_token[:1])
            if len(next_token) > 2:
                feature_list.append("SUF_" + next_token[-2:])
                feature_list.append("PRE_" + next_token[:2])
            if len(next_token) > 3:
                feature_list.append("SUF_" + next_token[-3:])
                feature_list.append("PRE_" + next_token[:3])

            feature_list.append("NEXT_NEXT_WORD_" + next_token)

        return feature_list
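Roughly, the feature list produced for the middle token of a three-token window looks like the sketch below; the tokens are invented and the stems are assumed to come back unchanged from the Stemmer, so only the ordering and the prefixes (WORD_, SUF_, PRE_, PREV_WORD_, NEXT_WORD_) reflect the code above:

# Invented example: _get_features(['ram', 'le', 'khayo'], 1), assuming stem() returns each token unchanged
# and none of the tokens contains digits or punctuation.
feature_list = [
    'WORD_le', 'SUF_e', 'PRE_l',                                                     # current token
    'SUF_m', 'PRE_r', 'SUF_am', 'PRE_ra', 'PREV_WORD_ram',                           # token at idx - 1
    'SUF_o', 'PRE_k', 'SUF_yo', 'PRE_kh', 'SUF_ayo', 'PRE_kha', 'NEXT_WORD_khayo',   # token at idx + 1
]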