def __init__(self, stemming):
    self.stop_words = stopwords.words('english')
    self.stop_words.extend([
        'rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m', '...', r'\'\'', r'\'d', '&',
        r'\'ll', r'\'re', r' ', r'', r"", r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
        r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{', '}', "'&'", '.', r'\'d', '-', '--'
    ])
    self.stop_words_dict = dict.fromkeys(self.stop_words)
    self.text_tokens = None
    self.stemmer = None
    if stemming:
        self.stemmer = Stemmer()
    # splits camel-cased hashtag bodies into their component words
    self.hashtag_split_pattern = re.compile(r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')
    self.take_off_non_latin = re.compile(
        pattern=r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]')
    self.left_slash_pattern = re.compile(r'^-?[0-9]+/0*[1-9][0-9]*$')
    self.right_slash_pattern = re.compile(r'^-?[0-9]+\\0*[1-9][0-9]*$')
    self.days_dict = {"Sat": "saturday", "Sun": "sunday", "Mon": "monday", "Tue": "tuesday",
                      "Wed": "wednesday", "Thu": "thursday", "Fri": "friday"}
    self.months_dict = {"Jul": ("july", "07"), "Aug": ("august", "08")}
    self.kbm_shorts = {"k": None, "m": None, "b": None, "K": None, "M": None, "B": None}
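# Illustrative sketch (not part of the original parser): how the hashtag_split_pattern defined
# above behaves on camel-cased hashtag bodies. Only the standard `re` module is assumed; the
# example strings are made up.
import re

hashtag_split_pattern = re.compile(r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')

print(hashtag_split_pattern.findall("StayHomeStaySafe"))  # ['Stay', 'Home', 'Stay', 'Safe']
print(hashtag_split_pattern.findall("COVID19Outbreak"))   # ['COVID19', 'Outbreak']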
def __init__(self, stem):
    self.stop_words = stopwords.words('english')
    self.stop_words.extend([
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during',
        'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours',
        'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from',
        'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his',
        'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
        'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any',
        'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
        'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he', 'you',
        'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few',
        'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further',
        'was', 'here', 'than', 'rt', "don't", '-', '&', 'it’s', 'don’t', 'i’m', "it's", "doesn't", 'https',
        't.co', 'twitter.com', 'weve', 'ur', 'due', 'damn', 'us', 'theyre', 'would', 'might'
    ])
    self.stop_words_dict = dict.fromkeys(self.stop_words, 0)
    # self.extra_stop_words = {"rt": 0, "https": 0, "t.co": 0, "twitter.com": 0, "weve": 0, "ur": 0,
    #                          "due": 0, "damn": 0, "us": 0, "theyre": 0, "would": 0, "might": 0}
    # self.stop_words_dict.update(self.extra_stop_words)
    self.term_dict = {}
    self.toStem = stem
    self.text_tokens = []
    self.stemmer = None
    if self.toStem:
        self.stemmer = Stemmer()
def __init__(self, config=None):
    self.tmp_for_entites = {}
    self.stop_words = stopwords.words('english') + [
        '?', '!', ',', '+', '-', '*', '/', '"', '.', '<', '>', '=', ':', '', '{', '{}', '}', '[', ']',
        '[]', 'are', 'and', 'an', 'at', 'am', 'a', 'even', 'every', 'everyone', 'rt', 'RT'
    ]
    self.global_dict = {}  # value = number of docs containing the term
    # key = "word",
    # value = [parquet name, index in parquet, tweet id, frequency in tweet, location in tweet, tf]
    self.post_dict = {}
    self.entities = {}
    self.path_stop_words = ['RT', "rt", 'tweet', 'www', 'http', 'https', 'WWW']
    self.corona_list = [
        "cov", 'corona', 'coronavirus', 'covid', 'covid19', 'covid 19', 'corona virus', 'virus corona',
        'corona_virus', 'virus_corona', "virus"
    ]
    self.config = config
    self.trump = [
        "donald", "donald trump", "trump donald", "president", "trump_donald", "donald_trump",
        "trump-donald", "donald-trump"
    ]
    self.stemmer = None
    # config defaults to None, so guard before reading config.toStem
    if self.config is not None and self.config.toStem:
        self.stemmer = Stemmer()
def stem(self, min_word_count=10):
    # restrict the stemmer vocabulary to words that occur at least min_word_count times
    stemmer = Stemmer({w: n for (w, n) in self.vocab.items() if n >= min_word_count})
    for mail in self.mails:
        mail.sents = [[stemmer.stem(w) for w in sent] for sent in mail.sents]
    self.stemmer = stemmer
def test_VC_measure(self):
    """Tests the VC measure."""
    stemmer = Stemmer()
    for word, measure in VC_DATA.items():
        self.assertEqual(
            stemmer.m(word), measure,
            "Measure test failed for word '%s': calculated (%d), should have been (%d)"
            % (word, stemmer.m(word), measure))
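# Context for the test above: in Porter's algorithm, m(word) is the "measure" of a word, i.e.
# the number of vowel-consonant (VC) sequences it contains. VC_DATA itself is defined elsewhere
# in the test module; an illustrative fixture in the same spirit (values follow the examples in
# Porter's 1980 paper) might look like this:
VC_DATA_EXAMPLE = {
    "tree": 0, "by": 0,           # m = 0
    "trouble": 1, "oats": 1,      # m = 1
    "troubles": 2, "private": 2,  # m = 2
}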
def add_new_doc(self, document, documents_list_length=10000):
    """
    This function performs the indexing process for a document object.
    Saved information is captured via two dictionaries ('inverted index' and 'posting').
    :param document: a document that needs to be indexed.
    :return: -
    """
    try:
        document_dictionary = document.term_doc_dictionary
        # self.countDoc += 1
        # create the stemmer once instead of once per term
        my_stemmer = Stemmer() if self.stemming == 'y' else None
        for term in document_dictionary.keys():
            if my_stemmer is not None:
                term = my_stemmer.stem_term(term)
            # Update inverted index and posting
            if term not in self.inverted_idx:
                # [amount of docs, [(freq in the doc, doc id), ...]]
                self.inverted_idx[term] = [1, [(document_dictionary[term], document.tweet_id)]]
            else:
                self.inverted_idx[term][0] += 1  # amount of docs
                self.inverted_idx[term][1].append((document_dictionary[term], document.tweet_id))

            if term not in self.postingDict:
                self.postingDict[term] = [(document.tweet_id, document_dictionary[term])]
            else:
                self.postingDict[term].append((document.tweet_id, document_dictionary[term]))
            # self.countTweet -= 1

            if document.tweet_id not in self.tweet_dict:
                # [[term, freq in tweet], amount of unique terms in tweet, amount of terms in tweet]
                self.tweet_dict[document.tweet_id] = [[term, document_dictionary[term]], 1, 0]
            elif document_dictionary[term] > self.tweet_dict[document.tweet_id][0][1]:
                # tweet exists: compare the frequencies of the two terms
                if self.tweet_dict[document.tweet_id][0][1] == 1:
                    # before replacing the term, check whether the previous term was unique
                    self.tweet_dict[document.tweet_id][1] += 1
                self.tweet_dict[document.tweet_id][0] = [term, document_dictionary[term]]
                self.tweet_dict[document.tweet_id][2] += 1
            elif document_dictionary[term] == 1:
                # tweet exists, term is not the most common; check whether it is unique
                self.tweet_dict[document.tweet_id][1] += 1
                self.tweet_dict[document.tweet_id][2] += 1
    except Exception:
        # print('problem in indexer : add_new_doc')
        # print(traceback.print_exc())
        pass
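# Minimal usage sketch for add_new_doc above. The concrete Indexer constructor and Document class
# are not shown in this snippet, so the stand-in below only mimics the two attributes the method
# actually reads (term_doc_dictionary and tweet_id); everything here is hypothetical.
from types import SimpleNamespace

doc = SimpleNamespace(
    tweet_id="1280000000000000000",
    term_doc_dictionary={"covid": 3, "mask": 1},  # term -> frequency in this tweet
)
# indexer = Indexer(config)         # assumed constructor
# indexer.add_new_doc(doc)
# indexer.inverted_idx["covid"]     # expected -> [1, [(3, "1280000000000000000")]]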
def __init__(self, config):
    self.word_dict = {}
    self.stemmer = Stemmer(config.stemming)
    self.stop_words = [
        self.stemmer.stem_term(word) for word in stopwords.words('english')
    ] + ['rt', 't.co', 'https']
    self.rules = config.parser_rules
    self.spell = SpellChecker()
    self.min_length = config.min_length
def __init__(self, with_stemmer=False, include_urls=False, include_quote=False, debug=False, timer=False):
    self.stemmer = Stemmer()
    self.with_stemmer = with_stemmer
    self.include_urls = include_urls
    self.include_quote = include_quote
    self.stop_words = stopwords.words('english')
    self.stop_words += ["i'm", "it's", 'they', "i've", 'you', 'u', 'we', 'rt', 'im', 'use', 'sure']
    self.debug = debug
    self.timer = timer
    self.times = []
def test_stem(self):
    """Checks the final stems."""
    stemmer = Stemmer()
    with open('output.txt') as output, open('voc.txt') as voc:
        for word in voc:
            word = word.strip()
            stem = next(output).strip()
            result = stemmer.stem(word)
            self.assertEqual(
                result, stem,
                "Test failed for word '%s' stemmed to %s, should have been %s" % (word, result, stem))
def __init__(self, config):
    self.with_stem = config.get_toStem()
    self.stemmer = Stemmer()
    self.stop_words = stopwords.words('english')
    self.stop_words.extend([
        r' ', r'', r"", r"''", r'""', r'"', r"“", r"”", r"’", r"‘", r"``", r"'", r"`", '"'
    ])
    self.stop_words.extend([
        'rt', r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{', '}', "'&'", '$', '.',
        r'\'s', '\'s', '\'d', r'\'d', r'n\'t'
    ])
    self.stop_words.extend(['1️⃣.1️⃣2️⃣'])
    self.stop_words_dict = dict.fromkeys(self.stop_words)

    # for avg
    self.total_len_docs = 0
    self.number_of_documents = 0

    self.url_pattern = re.compile(r'http\S+')
    self.url_www_pattern = re.compile(r"[/://?=]")
    # TODO - fix numbers pattern
    self.numbers_pattern = re.compile(r'^\d+([/|.|,]?\d+)*')
    self.non_latin_pattern = re.compile(
        pattern=r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]')
    self.dates_pattern = re.compile(
        r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
    )
    # TODO - fix emoji to include all emojis
    self.emojis_pattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, shapes & misc symbols
        u"\U00010000-\U0010ffff"
        u"\U0001f926-\U0001f937"
        u"\U000024C2-\U0001F251"
        u"\U00002702-\U000027B0"
        u"\u2640-\u2642"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
        u"\u2600-\u2B55"
        u"\uFE0F\u20E3\uFE0F\u20E3\uFE0F\u20E3"
        "]+",
        flags=re.UNICODE)
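# Illustrative check (an assumption, using only the standard `re` module) of what the date
# pattern compiled above accepts: dd/mm/yyyy-style dates, including leap-year handling. The
# pattern string is copied verbatim from the snippet above.
import re

dates_pattern = re.compile(
    r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$')

print(bool(dates_pattern.match("31/12/2019")))  # True  - valid day/month/year
print(bool(dates_pattern.match("29/02/2019")))  # False - 2019 is not a leap year
print(bool(dates_pattern.match("29/02/2020")))  # True  - leap year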
def __init__(self, stemming):
    self.stop_words = stopwords.words('english')
    self.stop_words += ["rt", "http", "https", "www", "twitter.com"]  # TODO: check &
    self.terms = set()
    self.nonstopwords = 0
    self.max_tf = 0
    self.toStem = stemming
    self.entities = {}
    self.stemmer = None
    if self.toStem:
        self.stemmer = Stemmer()
def __init__(self, config=None, advanced=False):
    # stopwords_to_add = ['rt']
    self.english_word = words.words()
    self.stop_words = stopwords.words('english')
    puncs_to_add = ['...', '', '\'', '“', '”', '’', '…']
    self.punctuators = list(string.punctuation) + puncs_to_add
    self.tt = TweetTokenizer()
    self.stemmer = Stemmer()
    self.need_stemming = config.toStem if isinstance(config, ConfigClass) else False
    self.caps_dict = {}
    self.rules_dict = {}
    self.advanced = advanced
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into its different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    indice = doc_as_list[4]
    retweet_text = doc_as_list[5]
    retweet_url = doc_as_list[6]
    retweet_indice = doc_as_list[7]
    quote_text = doc_as_list[8]
    quote_url = doc_as_list[9]
    quoted_indice = doc_as_list[10]
    retweet_quoted_text = doc_as_list[11]
    retweet_quoted_url = doc_as_list[12]
    retweet_quoted_indice = doc_as_list[13]

    term_dict = {}
    tokenized_text = self.parse_sentence(full_text)
    tokenized_quote = self.parse_sentence(quote_text)
    tokenized_url = self.handle_url(url)

    doc_length = len(tokenized_text)  # after text operations - length of full_text

    new_tokenized_text = tokenized_text + tokenized_url + tokenized_quote

    if self.stemming is True:
        # stem every token; building a new list avoids appending to and removing from
        # the list while iterating over it
        s = Stemmer()
        new_tokenized_text = [s.stem_term(token) for token in new_tokenized_text]

    for term in new_tokenized_text:
        if term != "":  # or (term.isalpha() and len(term) == 1)
            if term not in term_dict:
                term_dict[term] = 1
            else:
                term_dict[term] += 1

    document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text,
                        quote_url, term_dict, doc_length)
    return document
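# Hypothetical call sketch for parse_doc above: the 14-field layout is inferred from the index
# positions the method reads; the parser and Document classes are assumed to come from the
# surrounding project, so the calls are left commented out.
doc_as_list = [
    "1280000000000000000",      # [0]  tweet_id
    "2020-07-06 12:00:00",      # [1]  tweet_date
    "Wear a mask, stay safe",   # [2]  full_text
    "{}",                       # [3]  url
    "[]",                       # [4]  indices
    None, None, None,           # [5-7]  retweet text / url / indices
    None, None, None,           # [8-10] quote text / url / indices
    None, None, None,           # [11-13] retweet-quoted text / url / indices
]
# parser = Parse(stemming=False)             # assumed constructor
# document = parser.parse_doc(doc_as_list)   # -> Document with term_dict and doc_length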
def test_stemmer(self):
    line = "мамочка свари суп"  # Russian test data: "mommy, cook the soup"
    # tok = Tokenizer().tokenize_alph(line)
    fact = list(Stemmer().stem(Token(0, 7, line, 'a'), 4, line))
    check = [Token(0, 7, line, 'a'), Token(0, 6, line, 'a'), Token(0, 5, line, 'a'),
             Token(0, 4, line, 'a'), Token(0, 3, line, 'a')]
    fact1 = list(Stemmer().stem(Token(14, 17, line, 'a'), 4, line))
    check1 = [Token(14, 17, line, 'a')]
    self.assertEqual(fact, check)
    self.assertEqual(fact1, check1)
def test(self):
    print('Starting analysis')
    for trie_name in self.trie_files:
        print('Starting', trie_name)
        correct_number = 0
        all_number = 0
        s = Stemmer(self.plp, filename=trie_name, word_type=None)
        corrects_file = codecs.open(
            '../wyniki/single_name/wies_miasto_kolonia_osada/success_' + trie_name.replace('bak', 'txt'),
            'w', 'utf-8')
        result_file = codecs.open(
            '../wyniki/single_name/wies_miasto_kolonia_osada/' + trie_name.replace('bak', 'txt'),
            'w', 'utf-8')
        # CSV header (Polish): "Genitive;Nominative;Stemmer result"
        result_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
        corrects_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n')
        # for k, v in self.cities.iteritems():
        cities = codecs.open('../data/cities_wies_miasto_kolonia_osada.csv', 'r', 'utf-8')
        for city in cities:
            k = city.split(';')[1].strip()
            v = city.split(';')[0].strip()
            all_number += 1
            basic_form = ''
            # word_labels = []
            # if k.__contains__('-'):
            #     for city_parts in v.split('-'):
            #         b = s.find_basic_form(city_parts)
            #         basic_form += b.basic_form + '-'
            #         word_labels.append(b.word_labels)
            #     basic_form = basic_form[0:basic_form.__len__() - 1]
            # else:
            #     for city_parts in v.split(' '):
            #         b = s.find_basic_form(city_parts)
            #         basic_form += b.basic_form + ' '
            #         word_labels.append(b.word_labels)
            basic_form = s.find_basic_form(v).basic_form.strip()
            if basic_form != k:
                # if basic_form == k:
                result_file.write(v + ';' + k + ';' + basic_form + ';')
                # for w_label in word_labels:
                #     result_file.write(self.find_most_label(w_label) + ' ')
                result_file.write('\n')
            else:
                # corrects_file.write(v + ';' + k + ';' + basic_form + ';')
                # for label in s.find_labels(word_labels):
                #     corrects_file.write(label + ' ')
                # corrects_file.write('\n')
                correct_number += 1
        # summary row (Polish): "Number of localities;Number incorrectly recognized;Number correctly recognized"
        result_file.write(u'Liczba miejscowości;Liczba niepoprawnie rozpoznanych;Liczba poprawnie rozpoznanych\n')
        result_file.write(str(all_number) + ';' + str(all_number - correct_number) + ';' + str(correct_number))
        print('Done', trie_name)
def __init__(self, stemming=None):
    """
    Constructor for this class.
    :param stemming:
    """
    self.stop_words = stopwords.words('english')
    self.stemmer = None
    if stemming:
        self.stemmer = Stemmer()
    self.corona_list = [
        "SARS", "sars", "Severe Acute Respiratory Syndrome", "severe acute respiratory syndrome",
        "SARS-CoV", "SARS CoV", "sars-cov", "sars cov", "coronavirus", "corona virus", "COVID", "covid",
        "Covid", "COVID-19", "covid-19", "#coronavirus", "COVID__19", "#COVID", "#COVID-19", "#covid19",
        "#SARS"
    ]
class SearchEngine:
    _dictionary: Dictionary
    _tokenizer: Tokenizer
    _stemmer: Stemmer
    _query_result: QueryResult

    def __init__(self):
        self._dictionary = Dictionary(load=True)
        self._tokenizer = Tokenizer()
        self._stemmer = Stemmer()
        self._query_result = QueryResult()
        print(self._dictionary)

    def _search_for_token(self, token: Token):
        pl = self._dictionary.getPostingList(token.getWord())
        print(pl)
        if pl is not None:
            self._query_result.addToResults(token, pl)

    def listen(self):
        inp = input("Enter Your Query: ")
        # inp = "هفته"  # Persian for "week"
        query_tokens = self._tokenizer.tokenizeDoc(inp)
        normalized_query_tokens = self._stemmer.normalize_list(query_tokens)
        for p in normalized_query_tokens:
            self._search_for_token(p)
        self._query_result.buildCandidates()
        self._query_result.printKBestCandidates()
class Indexer:
    def __init__(self, docs_dir, docs_size):
        self.docLoader = DocLoader(docs_dir, docs_size)
        self.tokenizer = Tokenizer()
        self.stemmer = Stemmer()
        self.dictionary = Dictionary(load=False)
        self._clean()
        self._setup(docs_size)

    def _setup(self, docs_size):
        for doc_id in range(1, docs_size + 1):
            doc = self.docLoader.getDoc(doc_id)
            tokens = self.tokenizer.tokenizeDoc(doc)
            print("tokens: ")
            for token in tokens:
                print(token)
            normalized_words = self.stemmer.normalize_list(tokens)
            print("normalized_words: ")
            for token in normalized_words:
                print(token)
            for token in normalized_words:
                self.dictionary.addToken(token, doc_id)

    @staticmethod
    def _clean():
        # remove any previously generated index output before rebuilding it
        if os.path.exists("./dist"):
            try:
                shutil.rmtree("./dist")
            except (FileNotFoundError, FileExistsError):
                print("error")
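# Usage sketch for the Indexer above (paths and sizes are hypothetical; DocLoader, Tokenizer,
# Stemmer and Dictionary are assumed to be the project classes imported alongside this module):
# indexer = Indexer(docs_dir="./docs", docs_size=3)
# After construction, indexer.dictionary holds postings for doc ids 1..3, and any stale "./dist"
# output from a previous run has been removed by _clean().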
def __init__(self, stemming=0):
    """
    This function initializes the fields of Parse, creates the stemmer and loads the stop words.
    :param stemming: boolean value indicating whether stemming is needed (optional)
    """
    self.stemming = stemming
    self.stemmer = Stemmer()
    # self.stop_words = frozenset(stopwords.words('english')) ?
    self.stop_words = stopwords.words('english')
    self.stop_words.extend([
        ':', '\'s', '.', ',', ';', '’', '?', '!', 'rt', '-', '|', '~', '(', ')', '*', '+', '=', '/', '"',
        '``', '\'\'', '\n', '\n\n', '&', 'amp', '…', '\'', '`', '[', ']', '{', '}'
    ])
def test_stemmer_flex(self):
    line = "мамочка свари суп"
    fact = list(Stemmer().stem_flex(Token(0, 8, "мамочка свари суп", "a")))
    check = [Token(0, 8, line, 'a'), Token(0, 7, line, 'a')]
    self.assertEqual(fact, check)
def __init__(self, rootPath="", inputFolder=""):
    self.metadata = Metadata()
    self.stopper = Stopper()
    stopwords_folder = os.path.join(rootPath, "stopwords")
    print("Preprocessor root path: ", rootPath)
    self.stopper.load_stopwords(stopwords_folder)
    self.normalizer_tokenizer = NormalizationTokenization()
    self.stemmer = Stemmer()
    self.p1_path = ""
    self.p2_path = ""
    self.p3_path = ""
    self.rootPath = rootPath
    self.inputFolder = inputFolder
def __init__(self, config):
    self.with_stem = config.get_toStem()
    self.stemmer = Stemmer()
    self.stop_words = stopwords.words('english')
    self.stop_words.extend(['RT'])
    self.stop_words_dict = dict.fromkeys(self.stop_words)

    # for avg
    self.total_len_docs = 0
    self.number_of_documents = 0

    self.url_removal_pattern = re.compile(r'(https?://[^\s]+)')
    # TODO - fix numbers pattern
    self.numbers_pattern = re.compile(r'^\d+([/|.|,]?\d+)*')
    self.dates_pattern = re.compile(
        r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
    )
def __init__(self, stemming=False):
    self.stemming = stemming
    self.toStem = Stemmer()
    self.terms_dic_to_document = {}
    # self.lower_set = set()
    # self.upper_set = set()
    self.numberList = {
        "thousand": 'K', "million": 'M', "billion": 'B',
        "percentage": '%', "percent": '%', "dollar": '$'
    }
    self.stop_words = stopwords.words('english')
    # all stop words, keyed by their first letter
    self.dict_stop_words = {
        'a': [], 'b': [], 'c': [], 'd': [], 'e': [], 'f': [], 'g': [], 'h': [], 'i': [], 'j': [],
        'k': [], 'l': [], 'm': [], 'n': [], 'o': [], 'p': [], 'q': [], 'r': [], 's': [], 't': [],
        'u': [], 'v': [], 'w': [], 'x': [], 'y': [], 'z': []
    }
    # build the dict of stop words
    for w in self.stop_words:
        self.dict_stop_words[w[0]].append(w)
    # operators we don't want, plus parenthesis and separator characters
    self.skip_list = {
        ',', ';', ':', ' ', '\n', '(', ')', '[', ']', '{', '}', '*', '+', '-', '/', '<', '>', '&', '=',
        '|', '~', '"'
    }
    # all weird symbols
    self.wird_symbols = {
        '!', '#', '$', '%', '&', '(', ')', ',', '*', '+', '-', '.', '/', ':', ';', '<', '=', '>', '?',
        '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~'
    }
def main(args):
    dl = DataLoader()
    stem = Stemmer('porter')
    # files: one dict per input file, mapping each element's key to its stemmed description
    # (elements whose stemmed description is empty are skipped)
    files = [{element[0]: stem.stem(element[1])
              for element in dl.load_data(file)
              if stem.stem(element[1])} for file in args]
    for file, arg in zip(files, args):
        print('Processing file {}...'.format(arg))
        file = {k: list(v) for k, v in file.items()}
        print('Data Clusterer')
        test_clusterer(DataClusterer(list(file.values()), 'euclidean'), file)
        print('-' * 64)
        print('Description Clusterer')
        test_clusterer(DescriptionClusterer(list(file.values()), 'cosine'), file)
def __init__(self, stemming):
    self.stop_words = stopwords.words('english')
    self.stop_words.extend([
        'rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m', '...', r'\'\'', r'\'d', '&',
        r'\'ll', r'\'re', r' ', r'', r"", r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
        r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{', '}', "'&'", '.', r'\'d', '-',
        '--', 'mask', 'pandemic', 'people', 'wear', 'trump', 'masks', 'new', 'virus', 'wearing', 'cases',
        'amp', 'us', 'like'
    ])
    # , 'covid', '19', 'covid-19', 'mask', 'coronavirus', 'pandemic', 'people', 'wear', 'trump', 'covid19',
    # 'masks', 'new', 'virus', 'wearing', 'cases', 'amp', '#covid19', 'us', 'like'
    self.stop_words_dict = dict.fromkeys(self.stop_words)
    self.text_tokens = None
    self.stemmer = None
    if stemming:
        self.stemmer = Stemmer()
def __init__(self, corpus=None, cxp=True, swr=True, nr=True, stem=True):
    if corpus is not None:
        self.corpus_path = Path(str(corpus))
    else:
        self.corpus_path = None
    self.contraction_expansion_flag = False
    self.stop_word_flag = False
    self.noise_removal_flag = False
    self.stemmer_flag = False
    if cxp:
        self.contraction_expansion_flag = True
        self.contraction_expander = ContractionExpander()
    if swr:
        self.stop_word_flag = True
        self.stop_word_remover = StopWordRemover()
    if nr:
        self.noise_removal_flag = True
        self.noise_remover = NoiseRemover()
    if stem:
        self.stemmer_flag = True
        self.stemmer = Stemmer()
def predict(self, doc):
    # Prepare document
    doc = self.clean(doc)
    # Get the class with the highest score
    score = []
    for cat in self.C:
        # log prior: log10(docs in class / total docs)
        probability = math.log10(self.DC[cat] / self.D)
        for word in doc.split():
            if len(word) > 2:
                cur_word = Stemmer.stem(u'{}'.format(word))
                # add-one smoothed log likelihood of the word given the class
                probability += math.log10(
                    (self.WiC[cat].get(cur_word, 0) + 1) / (len(self.W) + self.WC[cat]))
        score.append(probability)
    return self.C[score.index(max(score))]
def train(self, doc, category):
    # Prepare document
    doc = self.clean(doc)
    # Update classifier:
    # Update D (total number of training documents)
    self.D += 1
    # Update C & DC (known categories, documents per category)
    if category not in self.C:
        self.C.append(category)
        self.DC[category] = 1
    else:
        self.DC[category] += 1
    for word in doc.split():
        if len(word) > 2:
            # 'Normalize' word
            cur_word = Stemmer.stem(u'{}'.format(word))
            # Update W (vocabulary)
            if cur_word not in self.W:
                self.W.append(cur_word)
            # Update WC (number of counted words per category)
            if category not in self.WC:
                self.WC[category] = 1
            else:
                self.WC[category] += 1
            # Update WiC (per-category word frequencies)
            if category not in self.WiC:
                self.WiC[category] = {}
            if cur_word not in self.WiC[category]:
                self.WiC[category][cur_word] = 1
            else:
                self.WiC[category][cur_word] += 1
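# Usage sketch for the classifier defined by train()/predict() above. The class name and clean()
# are project-specific, so everything here is an assumption rather than the original API.
# train() accumulates counts; predict() then picks
#     argmax_c [ log10 P(c) + sum_w log10( (count(w, c) + 1) / (|W| + WC[c]) ) ],
# i.e. a multinomial Naive Bayes with add-one (Laplace) smoothing.
# clf = Classifier()                                         # hypothetical constructor
# clf.train("the stemmer reduces words to their stems", "nlp")
# clf.train("the striker scored a late goal", "sport")
# print(clf.predict("stemming words in a search engine"))    # expected -> "nlp"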
def search_and_rank_query(queries, inverted_index, k, lda): #print("start:", datetime.now()) # config = ConfigClass() indexer = Indexer(config) # indexer = Indexer(config) to_stem = config.get__toStem() # to_stem = config.get__toStem() queries_list = [] if type(queries) is list: # if queries is a list for query in queries: queries_list.append(query) if type(queries) is str: # if queries is a text file with open(queries, encoding='utf-8') as f: for line in f: if line != "\n": queries_list.append(line) all_results = [] query_num = 1 tweet_id_num = 1 for query in queries_list: p = Parse(config) # parse LDA query tokenized_query = p.parse_sentence(query, 0) original_query_list = query.split(" ") stop_words = stopwords.words('english') original_query_list = [ w for w in original_query_list if w not in stop_words ] # find long terms and upper case words counter = 0 while counter < len(original_query_list): len_term = 1 word = original_query_list[counter] if word.isupper(): # NBA if word.find("\n") != -1: word = word[:-1] if word.find(".") != -1: word = word[:-1] if not to_stem: tokenized_query.append(word) else: stem_word = Stemmer().stem_term(word) tokenized_query.append(stem_word) elif len(word) > 1 and re.search( '[a-zA-Z]', word) and word[0].isupper(): # upper first char term = word if original_query_list.index(word) + 1 < len( original_query_list): index = original_query_list.index(word) + 1 while index < len(original_query_list): # find all term if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]', original_query_list[index]) and \ original_query_list[index][0].isupper(): new_word2 = original_query_list[index][ 0] + original_query_list[index][1:].lower( ) # Donald Trump term += " " + new_word2 index += 1 len_term += 1 else: break if len_term > 1: tokenized_query.append(term) counter += len_term #print(tokenized_query) # WordNet query wn = WordNet_ranker(tokenized_query) WordNet_query = wn.extend_query() #print("WordNet_query", WordNet_query) searcher = Searcher(inverted_index) #print("inverted_index", len(inverted_index)) # find relevant_docs relevant_docs = searcher.relevant_docs_from_posting(WordNet_query) #print("relevant", len(relevant_docs)) # find LDA relevant cosine_dict = lda.prob(tokenized_query) #print("cosine dict", len(cosine_dict)) dict_of_cosine_tweets = {} #list out keys and values separately key_list = list(indexer.tweet_line_dict.keys()) val_list = list(indexer.tweet_line_dict.values()) for index in cosine_dict.keys(): # find the tweet id dict_of_cosine_tweets[key_list[val_list.index( index)]] = cosine_dict[index] #print("finish_topic relevant", len(dict_of_cosine_tweets)) final_dict = {} for tweet_id in dict_of_cosine_tweets.keys(): if k > len(final_dict): if tweet_id in relevant_docs: final_dict[tweet_id] = 0 final_dict[tweet_id] += (relevant_docs[tweet_id] + dict_of_cosine_tweets[tweet_id]) sorted_cosine_tweets = { k: v for k, v in sorted( final_dict.items(), key=lambda item: item[1], reverse=True) } final_tweets = list(sorted_cosine_tweets.keys()) #print("final before add K", len(final_tweets)) if k > len(final_tweets): for key in relevant_docs.keys(): if key not in final_dict: if k > len(final_tweets): final_tweets.append(key) if k == len(final_tweets): break #print("final after K", len(final_tweets)) #print("relevant", relevant_docs) #print("sorted_cosine_tweets", sorted_cosine_tweets) """for tweet in relevant_docs.keys(): if tweet in list_of_cosine_tweets: if len(final_tweets) < k: final_tweets.append(tweet) if len(final_tweets) < k: sorted_cosine_tweets 
= {k: v for k, v in sorted(list_of_cosine_tweets.items(), key=lambda item: item[1], reverse=True)} for key in sorted_cosine_tweets: if k > len(final_tweets) and key not in final_tweets: final_tweets.append(key) else: break""" # write the results into csv file tweet_id_num = 1 s = "" with open('results.csv', 'a', encoding='utf-8') as fp: for p in final_tweets: s = ("Tweet id: " + "{" + p + "}" + " Score: " + "{" + str(tweet_id_num) + "}" + "\n") tweet_id_num += 1 fp.write(s) query_num += 1 all_results.append(final_tweets) #print("end:", datetime.now()) # return top K of final_tweets return all_results
# query = pattern.getPhoneticCode()
# document = searchEntry5.getPhoneticCode()
# print(query)
# print(document)
# print(" ")
# print(pattern.data.comparePhoneticCodeLists(query, document))

# varList = ["halten", "hielt", "gehalt", "haltbar"]
# so = Stemmer("")
# print(so.successorVariety("gehalten", varList))
# varObject = Phonetics("")
# sv = varObject.calcSuccVarietyList(varList)
# print(sv)
# svm = varObject.calcSuccVarietyMerge(sv)
# print(svm)
# print(varObject.calcSuccVarietyCount(svm))

# German test sentence: "the children are happy about the chestnuts"
# text = Advas(["die Kinder freuen sich über die Kastanien"], "")
# keywordList = ["die", "der", "das", "sich"]
# print(text.isLanguageByKeywords(keywordList))

# text = Advas(["Schule"], "")
# print(text.getSynonyms("/home/frank/projekte/openthesaurus/openthesaurus.txt", ""))
# print(text.isSynonymOf("Bildungszentrum", "/home/frank/projekte/openthesaurus/openthesaurus.txt", ""))

# -- ngram stemmer
stemmerObject = Stemmer("")
print(stemmerObject.ngramStemmer(["halten", "hielt", "halter", "halt", "gehalten"], 2, 0.4))
from stopwordsremover import StopWordsRemover
from texthandler import TextHandler
from stemmer import Stemmer
from tfidf import TFIDFHandler
from searchhandler import SearchHandler

# Text to be converted
text = """
The 2019–20 coronavirus pandemic is an ongoing pandemic of coronavirus disease 2019 (COVID-19), caused by
severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The outbreak was first noted in Wuhan, Hubei
province, China, in December 2019. The World Health Organization (WHO) declared the outbreak to be a Public
Health Emergency of International Concern on 30 January 2020 and recognized it as a pandemic on 11 March
2020. As of 6 April 2020, more than 1,270,000 cases of COVID-19 have been reported in over 200 countries and
territories, resulting in approximately 69,400 deaths. More than 260,000 people have recovered.
"""

# Remove stopwords or unnecessary words such as "is", "a"
removed_stopwords_text = StopWordsRemover.remove(text)

# Stem to reduce inflected words to their word stem, base or root form
stemmed_text = Stemmer.stem(removed_stopwords_text)

# Count how many times each word appears in the document
sanitized_text = TextHandler.WordCounter(stemmed_text)

book1 = {
    "ID": '1',
    "Title": "Covid",
    "Subtitle": "viruses",
    "Author": "author 1",
    "RawText": text,
    "SanitizedText": sanitized_text,
    "RemovedStopWordsText": removed_stopwords_text,
    "TotalNoOfTerms": len(text.lower().split(" ")),
    "TFIDF": 0,
}

text2 = """
Artificial neural networks (ANN) or connectionist systems are computing systems vaguely inspired by the
biological neural networks that constitute animal brains. Such systems "learn" to perform tasks by
considering examples, generally without being programmed with task-specific rules. For example, in image
recognition, they might learn to identify images that contain cats by analyzing example images that have
been manually labeled as "cat" or "no cat" and using the results to identify cats in other images. They do
this without any prior knowledge of cats, for example, that they have fur, tails, whiskers and cat-like
faces. Instead, they automatically generate identifying characteristics from the examples that they process.
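# Sketch of the TF-IDF weighting that the TFIDFHandler imported above presumably computes (its
# real API is not shown here, so this standalone helper and the toy documents are illustrative
# only): tf-idf(t, d) = tf(t, d) * log10(N / df(t)), with tf taken from per-book word counts.
import math

def tf_idf(term, book_counts, all_books_counts):
    """book_counts: {word: count} for one book; all_books_counts: list of such dicts."""
    tf = book_counts.get(term, 0)
    df = sum(1 for counts in all_books_counts if term in counts)
    if tf == 0 or df == 0:
        return 0.0
    return tf * math.log10(len(all_books_counts) / df)

# Example with two tiny made-up documents:
docs = [{"covid": 3, "virus": 2}, {"network": 4, "neural": 3}]
print(tf_idf("covid", docs[0], docs))  # 3 * log10(2/1) ≈ 0.903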
"duree", "ville", "lieu", "labo", ] outdir = "archives_SFBI_AnnotationManuelle" mails = list(mailLoaderGen()) words = Counter() for mail in mails: mail.sents = list(iterTokenizedSentences(mail.description)) for sent in mail.sents: words.update(sent) stemmer = Stemmer(set(word for (word, n) in words.items() if n > 10)) for m in mails: outf = outdir + m.mailfile.strip("archives_SFBI") d = m.__dict__ d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y") with open(outf, "wt") as f: d["from"] = d.pop("sender") if m.sfbi: ce = d["contact-email"] ce = "\t".join(ce) if type(ce) is set else ce d["contact-email"] = ce.replace(" [dot] ", ".").replace("[at]", "@") cn = d["contact-nom"] d["contact-nom"] = "\t".join(cn) if type(cn) is set else cn
def parse_sentence(self, text, tweet_id): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ # print(text) text_tokens = word_tokenize(text) if text_tokens[0] == 'RT': return [] # find TAGS if "@" in text_tokens: index_list1 = [n for n, x in enumerate(text_tokens) if x == '@'] counter = 0 for index in index_list1: if index + 1 < len(text_tokens): if text_tokens[index + 1] != '@': new_term = text_tokens[index] + text_tokens[index + 1] text_tokens.append(new_term) counter += 1 for sign in range( counter ): # deletes all '@' and the word after it from list rmv_index = text_tokens.index('@') if rmv_index + 1 < len(text_tokens): if text_tokens[rmv_index + 1] != '@': del text_tokens[rmv_index + 1] else: del text_tokens[rmv_index + 1] del text_tokens[rmv_index + 1] text_tokens.remove('@') ############################################################################################## # find PERCENTAGES if "%" or "percent" or "Percent" or "percentage" or "Percentage" in text_tokens: index_list2 = [ n for n, x in enumerate(text_tokens) if x == '%' or x == 'percent' or x == "percentage" or x == 'Percent' or x == "Percentage" ] counter2 = 0 for index in index_list2: if index - 1 >= 0: if not re.search('[a-zA-Z]', text_tokens[index - 1]): new_term = text_tokens[index - 1] + '%' text_tokens.append(new_term) if text_tokens[index] == '%': counter2 += 1 while counter2 > 0: # deletes all '%' and the word after it from list rmv_index = text_tokens.index('%') if rmv_index + 1 < len(text_tokens) and text_tokens[ rmv_index + 1] == '%': #if %% del text_tokens[rmv_index + 1] counter2 -= 1 if rmv_index - 1 >= 0 and not re.search( '[a-zA-Z]', text_tokens[rmv_index - 1]): #is number del text_tokens[rmv_index] del text_tokens[rmv_index - 1] counter2 -= 1 ############################################################################################## # finding terms, entities and capital letter self.parse_term(text_tokens, tweet_id) ############################################################################################## # find NUMBERS numbers = [] for item in text_tokens: #([0-9]+[,.]+[0-9]+) item.isnumeric() or item.isdigit() or item.isdecimal() or if re.findall("^\d+$|^[0-9]{1,3}([,.\/][0-9]{1,3}){0,6}$", item) and not re.search( '[a-zA-Z]', item): #^\d+$|^[0-9]{1,3}([,.][0-9]{1,3})?$ if item.find('-') == -1 and item.find('€') == -1 and item.find( '£') == -1 and item.find('%') == -1 and item.find( '¢') == -1 and item.find('~') == -1 and item.find( '+') == -1 and item.find( '/') <= 1 and item.find("'") == -1: if item.find(',') == -1: numbers.append(item) elif item.find(',') != -1 and re.findall( "^([0-9]{1,3})(,[0-9]{3})*$", item): numbers.append(item) # if len(numbers) >0: # print(numbers) fractions_list = [] for num in numbers: occur = num.count('.') if occur < 2: # not a date rmv_index = text_tokens.index(num) to_append = True no_text = True found_fractions = False if text_tokens[rmv_index].find( "/") != -1 and rmv_index - 1 > 0 and text_tokens[ rmv_index - 1].isnumeric(): # if found_fractions all_fractions = text_tokens[ rmv_index - 1] + " " + text_tokens[rmv_index] fractions_list.append(all_fractions) found_fractions = True to_append = True if rmv_index + 1 < len(text_tokens): # yes text if text_tokens[rmv_index + 1] == "million" or text_tokens[rmv_index + 1] == "Million" or \ text_tokens[rmv_index + 1] == "M" or text_tokens[rmv_index + 1] == "m" or text_tokens[rmv_index + 1] == "MILLION": if len(num) < 6: fixed_num = re.sub("[^\d\.]", 
"", num) # remove comma new_num = self.parse_numbers( str(float(fixed_num) * 1000000)) else: new_num = self.parse_numbers(num) no_text = False text_tokens[rmv_index + 1] = " " # remove from list text_tokens[rmv_index] = " " if text_tokens[rmv_index + 1] == "billion" or text_tokens[rmv_index + 1] == "Billion" or \ text_tokens[rmv_index + 1] == "B" or text_tokens[rmv_index + 1] == "b" or text_tokens[rmv_index + 1] == "BILLION": if len(num) < 9: fixed_num = re.sub("[^\d\.]", "", num) # remove comma new_num = self.parse_numbers( str(float(fixed_num) * 1000000000)) else: new_num = self.parse_numbers(num) no_text = False text_tokens[rmv_index + 1] = " " # remove from list text_tokens[rmv_index] = " " if text_tokens[rmv_index + 1] == "thousand" or text_tokens[rmv_index + 1] == "Thousand" or \ text_tokens[rmv_index + 1] == "K" or text_tokens[rmv_index + 1] == "k" or text_tokens[rmv_index + 1] == "THOUSAND": if len(num) < 4: fixed_num = re.sub("[^\d\.]", "", num) # remove comma new_num = self.parse_numbers( str(float(fixed_num) * 1000)) else: new_num = self.parse_numbers(num) no_text = False text_tokens[rmv_index + 1] = " " # remove from list text_tokens[rmv_index] = " " if not no_text: text_tokens[rmv_index + 1] # TODO:????????????????? if rmv_index - 1 >= 0 and text_tokens[rmv_index - 1] == '$': # yes $ if no_text: if len(num) > 3: text_tokens.append("$" + self.parse_numbers(num)) else: text_tokens.append("$" + num) text_tokens[rmv_index] = " " # remove $ from list text_tokens[rmv_index - 1] = " " else: text_tokens.append("$" + new_num) text_tokens[rmv_index - 1] = " " # remove $ from list to_append = False if to_append: # no $ if no_text: if len(num) > 3: text_tokens.append(self.parse_numbers(num)) text_tokens[ rmv_index] = " " # remove num from list else: text_tokens.append(new_num) if found_fractions: # delete fractions del text_tokens[rmv_index] del text_tokens[rmv_index - 1] """punctuations = '''!(-+—[]{};:'",)<>,./?^&*_’~|=→"”“''' # removes relevant punctuations and http and //short url index_count = 0 for word in text_tokens: to_delete = False if len(word) > 1 and word.find('-') != -1: # contains '-' text_tokens.extend(word.split('-')) text_tokens.remove(word) to_delete = True if len(word) > 1 and word.find('…') != -1: # contains '…' if to_delete == False: text_tokens.extend(word.split('…')) text_tokens.remove(word) to_delete = True if len(word) > 1 and word.find('_') != -1: # contains '_' if to_delete == False: text_tokens.extend(word.split('_')) text_tokens.remove(word) to_delete = True if len(word) > 1 and word.find('+') != -1: # contains '+' if to_delete == False: text_tokens.extend(word.split('+')) text_tokens.remove(word) to_delete = True if len(word) > 1 and word.find('/') != -1 and not (word[0] == '/' and word[1] == '/'): # contains '/' if to_delete == False: text_tokens.extend(word.split('/')) text_tokens.remove(word) to_delete = True if to_delete == False: if word in punctuations: i = text_tokens.index(word) text_tokens[i] = " " elif word == "http" or word == "https" or word == "http..." or word == "https..." or word == "RT" or word == "rt": i2 = text_tokens.index(word) text_tokens[i2] = " " elif len(word) > 1 and word[0] == '/' and word[1] == '/': i3 = text_tokens.index(word) text_tokens[i3] = " " else: text_tokens[index_count] = ''.join([i if ord(i) < 128 else '' for i in word]) index_count += 1 text_tokens[:] = [x for x in text_tokens if x != " " and x != ".." and x != "..." and x != "...." and x != "....." and x != "......" 
and x != "``" and x != "''" and x != "'s" and x != "'m" and x != "n't" and x != "." and x != "" and x != "'re" and x != "__" and x != "_" and x != "___" and x != "," and x != "!"]""" ############################################################################################## # find punctuations new_words = [] regex_pattern_for_num = '.*\d\.\d.*' regex_pattern_for_punctuation = 't.co.*|\'m|\'s|n\'t|\'re|\(|\)|\!|\-|\+|\[|\]|\{|\}|\;|\:|\'|\,|\<|\>|\?|\"|\^|\&|\*|\_|\~|\`|\||\=|\→|\/|\”|\“|\’|\—|\.|\``|\\\\|http.*|https.*|^RT$|^rt$' for word in text_tokens: # if term is a number in form ...d.d.. exp 230.3K - add to list if re.match(regex_pattern_for_num, word): new_words.append(word) continue # else - remove all punctuation from the term else: word = re.sub(regex_pattern_for_punctuation, '', word, flags=re.IGNORECASE) word = ''.join([i if ord(i) < 128 else '' for i in word]) if word == '' or word == ' ': continue new_words.append(word) text_tokens = new_words ############################################################################################## # find HASHTAGS # TODO: #whereIsKCR combined if "#" in text_tokens: index_list3 = [n for n, x in enumerate(text_tokens) if x == '#'] for index in index_list3: if index + 1 < len(text_tokens): if text_tokens[index + 1] != '#' and text_tokens[ index + 1][0] != '@' and text_tokens[index + 1].find( "#") == -1: #next word is not # and not @ if text_tokens[index + 1].find('_') == -1: # not contains '_' new_term = text_tokens[index] + text_tokens[index + 1] text_tokens.append(new_term) for sign in range( len(index_list3 )): # deletes all '#' and the word after it from list rmv_index = text_tokens.index('#') if rmv_index + 1 < len(text_tokens) and text_tokens[rmv_index + 1] != '#'\ and text_tokens[rmv_index + 1][0] != '@' and text_tokens[rmv_index + 1].find("#") == -1: word_val = text_tokens[rmv_index + 1] if not word_val.isupper() and not word_val.islower( ) and word_val.find('_') == -1: # split uppercase list_of_words = re.findall('[A-Z][^A-Z]*', word_val) for word in list_of_words: text_tokens.append(word) if word_val.find('_') != -1: # split '_' list_of_words = word_val.split('_') new_word = "#" for word in list_of_words: new_word += word text_tokens.append(word) # appends each word text_tokens.append(new_word) # appends #word if text_tokens[rmv_index + 1][0] != '@' and ( (not word_val.isupper() and not word_val.islower()) or word_val.islower() or (word_val.find('_') != -1)): #TODO: delete #fuck_you del text_tokens[rmv_index + 1] text_tokens.remove('#') ############################################################################################## # add fractions text_tokens.extend(fractions_list) ############################################################################################## # remove stop_words text_tokens_without_stopwords = [ w.lower() for w in text_tokens if w not in self.stop_words ] # print(text_tokens) # print(text_tokens_without_stopwords) ############################################################################################## # if stemmer to_stem = self.config.get__toStem() if to_stem: stem_text_tokens_without_stopwords = [] for token in text_tokens_without_stopwords: stem_token = Stemmer().stem_term(token) stem_text_tokens_without_stopwords.append(stem_token) #print(stem_text_tokens_without_stopwords) return stem_text_tokens_without_stopwords return text_tokens_without_stopwords
import codecs

from plp import PLP
from stemmer import Stemmer

__author__ = 'maciej'

plp = PLP()
plp._init()

ile_poprawnych = 0   # number of correctly recognized forms
ile_wszystkich = 0   # total number of forms

s = Stemmer(plp, filename='trie.bak', word_type=None)
f = codecs.open('test.txt', 'r', 'utf-8')
for line in f:
    ile_wszystkich += 1
    parts = line.split(',')
    b_form = s.find_basic_form(parts[0])
    if b_form.basic_form.strip() == parts[1].strip():
        ile_poprawnych += 1
    else:
        print(b_form.basic_form, ';', parts[1], ';', parts[0])

# Polish output: "Number correctly recognized: ..." / "Number incorrectly recognized: ..."
print('Liczba poprawnie rozpoznanych: ', ile_poprawnych,
      '\nLiczba niepoprawnie rozpoznanych:', ile_wszystkich - ile_poprawnych)