import re

from Stemmer import Stemmer


def text_cleaner(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'https?://[\S]+', ' url ', text)  # replace internet links
    text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', text)
    text = re.sub(r'\d+[-/\.]\d+[-/\.]\d+', ' date ', text)  # replace dates and times
    text = re.sub(r'\d+ ?гг?', ' date ', text)
    text = re.sub(r'\d+:\d+(:\d+)?', ' time ', text)
    # text = re.sub(r'@\w+', ' tname ', text)  # replace Twitter user names
    # text = re.sub(r'#\w+', ' htag ', text)  # replace hashtags
    text = re.sub(r'<[^>]*>', ' ', text)  # remove HTML tags
    text = re.sub(r'[\W]+', ' ', text)  # remove leftover punctuation
    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))
    stw = ['в', 'по', 'на', 'из', 'и', 'или', 'не', 'но', 'за', 'над', 'под', 'то',
           'a', 'at', 'on', 'of', 'and', 'or', 'in', 'for']
    remove = r'\b(' + '|'.join(stw) + r')\b'  # note the raw suffix: a plain ')\b' is a backspace
    text = re.sub(remove, ' ', text)
    text = re.sub(r'\b\w\b', ' ', text)  # remove free-standing single letters
    text = re.sub(r'\b\d+\b', ' digit ', text)  # replace numbers
    return text
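# A minimal usage sketch of text_cleaner() above. The sample sentence is an
# illustrative assumption; the url/date/digit tokens are guaranteed by the
# regexes, while the remaining words come out as Russian Snowball stems.
cleaned = text_cleaner('Читайте https://example.com 01.02.2020, цена 100')
print(cleaned)  # URLs become ' url ', dates ' date ', numbers ' digit '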
def text_cleaner(text: str):
    text = text.lower()
    stemmer = Stemmer("russian")  # language the input data is expected to be in
    text = " ".join(stemmer.stemWords(text.split()))
    text = re.sub(r"\b\d+\b", "digit", text)  # replace standalone numbers with the token "digit"
    return text
from math import log

from nltk import regexp_tokenize
from Stemmer import Stemmer


def classif(text, mass, num_all_docs, num_words_unic):
    """Scores `text` against every category in `mass` with multinomial naive
    Bayes (add-one smoothing, base-1.1 logarithms) and prints the winner."""
    stm = Stemmer('russian')
    tokens = stm.stemWords(regexp_tokenize(text.lower(), r"(?x) \w+ | \w+(-\w+)*"))
    for category in mass:
        # Log prior: fraction of training documents that belong to this category.
        summand_1 = log(float(category.num_docs) / num_all_docs, 1.1)
        summa = 0.0
        for word in tokens:
            num_povt_words = category.lst_allword.count(word)
            # Log likelihood of the word under this category, with add-one smoothing.
            summa += log((num_povt_words + 1.0) / (num_words_unic + category.num_words), 1.1)
        category.c = summand_1 + summa
    for category in mass:
        print(category.c)
    best = max(mass, key=lambda category: category.c)
    print(best.name_categories)
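# The score computed by classif() above is the multinomial naive Bayes decision
# rule with add-one (Laplace) smoothing: score(c) = log P(c) + sum over tokens w
# of log((count(w, c) + 1) / (V + N_c)). The base-1.1 logarithm only rescales
# every score by the same constant factor, so it does not change which category
# wins. A compact re-implementation of the same rule (hypothetical names, no
# Categories class) could look like this:
from collections import Counter
from math import log


def nb_score(tokens, class_tokens, class_docs, total_docs, vocab_size):
    """Multinomial naive Bayes log-score with add-one smoothing."""
    counts = Counter(class_tokens)        # per-class token frequencies
    score = log(class_docs / total_docs)  # log prior
    for tok in tokens:
        score += log((counts[tok] + 1) / (vocab_size + len(class_tokens)))
    return score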
class MyStemmer(object):

    def __init__(self, stemmer_type):
        self.stemmer = Stemmer(stemmer_type)

    def do_stemming(self, word_list):
        return self.stemmer.stemWords(word_list)
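# MyStemmer is a thin pass-through to PyStemmer; the outputs shown are the
# English Snowball (Porter2) stems:
stemmer = MyStemmer('english')
print(stemmer.do_stemming(['running', 'easily', 'cats']))
# ['run', 'easili', 'cat']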
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words).

    Uses the Snowball (Porter-family) stemming algorithms.
    """

    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note:: This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
def _prepare_text(self, text):
    """Extracts and stems the words from some given text."""
    words = re.findall(r"[a-z0-9']+", text.lower())
    words = [word for word in words if word not in STOP_WORDS]
    stemmer = Stemmer('english')
    stemmed_words = stemmer.stemWords(words)
    return stemmed_words
import re

from nltk import regexp_tokenize
from Stemmer import Stemmer


def train(name_file_dbase, way_to_dbase):
    """Builds per-category word statistics for the naive Bayes classifier.

    `unpack_line()` and the `Categories` container are defined elsewhere in
    this project.
    """
    stm = Stemmer('russian')
    with open(name_file_dbase, 'r') as file_base:
        lines = file_base.readlines()
    num_all_docs = len(lines) + 1

    # Split the index file into categories: a document number of "1" opens a
    # new category, and every following line is another document of it.
    mass = []
    iter1 = 0
    iter2 = 0
    for line in lines:
        number1, address1 = unpack_line(line)
        number = number1.strip("\n")
        address = address1.strip("\n")
        if number == "1":
            mass.append(Categories())
            mass[iter1].name_categories = address
            mass[iter1 - 1].num_docs = iter2
            iter1 += 1
            iter2 = 0
        iter2 += 1
    mass[-1].num_docs = iter2

    # Concatenate the raw text of every document of each category.
    for category in mass:
        for number in range(1, category.num_docs + 1):
            path = way_to_dbase + category.name_categories + '/' + str(number) + 'forclass.txt'
            with open(path, 'r') as file_forclass:
                category.line_allword += re.sub(r"^\s+|\n|\r|\s+$", ' ', file_forclass.read())

    # Tokenize and stem each category's text, then count its (unique) words.
    for category in mass:
        str_read = stm.stemWords(
            regexp_tokenize(category.line_allword.lower(), r"(?x) \w+ | \w+(-\w+)*"))
        category.num_words = len(str_read)
        category.lst_allword = str_read
        category.num_wordsunic = len(set(str_read))

    num_words_unic = sum(category.num_wordsunic for category in mass)
    return mass, num_all_docs, num_words_unic
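# A hypothetical end-to-end call of train()/classif(), assuming the layout the
# code above implies: each line of dbase.txt holds a "<number> <category>" pair
# (a number of "1" opens a new category) and documents are stored as
# dbase/<category>/<n>forclass.txt. All names here are illustrative.
mass, num_all_docs, num_words_unic = train('dbase.txt', 'dbase/')
classif('пример текста для классификации', mass, num_all_docs, num_words_unic)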
def get_search_phrases(self, indexing_func=None):
    """Returns search phrases from properties in a given Model instance.

    Args (optional):
        only_index: List of strings. Restricts indexing to these property names.
        indexing_func: A function that returns a set of keywords or phrases.

    Note that the indexing_func can be passed in to allow more customized
    search phrase generation.

    Two model variables influence the output of this method:
        INDEX_ONLY: If None, all indexable properties are indexed. If a list
            of property names, only those properties are indexed.
        INDEX_MULTI_WORD: Class variable that allows multi-word search
            phrases like "statue of liberty."
        INDEX_STEMMING: Returns stemmed phrases.
    """
    # Legacy Google App Engine (Python 2) code.
    if not indexing_func:
        klass = self.__class__
        if klass.INDEX_MULTI_WORD:
            indexing_func = klass.get_search_phraseset
        else:
            indexing_func = klass.get_simple_search_phraseset
    if self.INDEX_STEMMING:
        stemmer = Stemmer('english')
    phrases = set()
    # allow indexing of 'subentities' such as tasks of a list as well
    queries = [(self, self.INDEX_ONLY)] + self.INDEX_SUBENTITY_QUERIES
    for query, props in queries:
        entities = []
        try:
            subentities = query(self).fetch(1000)  # get all of them
            while len(subentities) > 0:
                entities.extend(subentities)
                last_key = subentities[-1].key()
                subentities = query(self).order('__key__').filter(
                    '__key__ >', last_key).fetch(1000)
        except TypeError:
            # query is not callable because it's an actual entity
            entities = [query]
        for entity in entities:
            for prop_name, prop_value in entity.properties().iteritems():
                if not props or prop_name in props:
                    values = prop_value.get_value_for_datastore(entity)
                    if not isinstance(values, list):
                        values = [values]
                    if (isinstance(values[0], basestring) and
                            not isinstance(values[0], datastore_types.Blob)):
                        for value in values:
                            words = indexing_func(
                                value, add_stop_words=self.INDEX_ADD_STOP_WORDS)
                            if self.INDEX_STEMMING:
                                stemmed_words = set(stemmer.stemWords(words))
                                phrases.update(stemmed_words)
                            else:
                                phrases.update(words)
import re

from gensim.utils import simple_preprocess
from Stemmer import Stemmer


def process_text(s):
    s = re.sub('<[^>]+>', '', s)  # strip HTML tags
    s = re.sub('&.*?;', '', s)    # strip HTML entities
    words = simple_preprocess(s, deacc=True, max_len=99)
    words = [word for word in words if word not in stoplist]  # `stoplist` is defined at module level
    stemmer = Stemmer('english')
    words = stemmer.stemWords(words)
    return words
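# What simple_preprocess() contributes before the stop-word filter and stemmer
# run: it lowercases, tokenizes, drops one-character tokens, and (with
# deacc=True) strips accents. A quick check:
from gensim.utils import simple_preprocess

print(simple_preprocess("Crème brûlée, yum!", deacc=True, max_len=99))
# ['creme', 'brulee', 'yum']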
class PyStemmerMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language which is supported by the "PyStemmer" Python module."""

    def __init__(self):
        """Constructor."""
        super().__init__()

        # PyStemmer instance (lazily initialized)
        self.__pystemmer = None

    def stem_words(self, words: List[str]) -> List[str]:
        """Stem a list of words with PyStemmer."""
        language_code = self.language_code()
        words = decode_object_from_bytes_if_needed(words)

        # Normalize apostrophes so that "it’s" and "it's" get treated identically
        # (this is done in _tokenize_with_spaces() too, but let's not assume that all
        # tokens to be stemmed go through sentence tokenization first)
        words = [word.replace("’", "'") for word in words]

        if language_code is None:
            raise McLanguageException("Language code is None.")
        if words is None:
            raise McLanguageException("Words to stem is None.")

        # (Re-)initialize stemmer if needed
        if self.__pystemmer is None:
            try:
                self.__pystemmer = PyStemmer(language_code)
            except Exception as ex:
                raise McLanguageException(
                    "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex))
                )

        stems = self.__pystemmer.stemWords(words)

        if len(words) != len(stems):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems))
            )

        # Perl's Snowball implementation used to return lowercase stems
        stems = [stem.lower() for stem in stems]

        return stems
def stem_words(iterable, language='english'):
    """Stem every word in iterable.

    Uses PyStemmer, which is based on the Porter stemming algorithms - an
    algorithm for suffix stripping.

    https://tartarus.org/martin/PorterStemmer/def.txt

    :rtype: list.
    """
    try:
        stemmer = Stemmer(language)
    except KeyError:
        # Unknown language: fall back to English.
        stemmer = Stemmer('english')
    return stemmer.stemWords(iterable)
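# PyStemmer raises KeyError for an unknown algorithm name, which is exactly
# what the fallback above catches. A quick usage sketch (English Snowball stems):
print(stem_words(['running', 'jumped', 'cats']))         # ['run', 'jump', 'cat']
print(stem_words(['running'], language='no-such-lang'))  # falls back to English: ['run']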
def __text_cleaner_with_stemming(raw_text):
    """
    Cleans up the text with regular expressions, then stems it.

    :param raw_text: source text
    :return: clean text
    """
    raw_text = raw_text.lower()  # convert to lowercase
    raw_text = re.sub(r'https?://[\S]+', ' url ', raw_text)  # replace internet links
    raw_text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', raw_text)
    raw_text = re.sub(r'\d+[-/\.]\d+[-/\.]\d+', ' date ', raw_text)  # replace dates and times
    raw_text = re.sub(r'\d+ ?гг?', ' date ', raw_text)
    raw_text = re.sub(r'\d+:\d+(:\d+)?', ' time ', raw_text)
    raw_text = re.sub(r'@\w+', ' tname ', raw_text)  # replace Twitter user names
    raw_text = re.sub(r'#\w+', ' htag ', raw_text)  # replace hashtags
    raw_text = re.sub(r'<[^>]*>', ' ', raw_text)  # remove HTML tags
    raw_text = re.sub(r'[\W]+', ' ', raw_text)  # remove leftover punctuation
    stemmer = Stemmer('russian')
    raw_text = ' '.join(stemmer.stemWords(raw_text.split()))
    stw = ['в', 'по', 'на', 'из', 'и', 'или', 'не', 'но', 'за', 'над', 'под', 'то',
           'a', 'at', 'on', 'of', 'and', 'or', 'in', 'for']
    remove = r'\b(' + '|'.join(stw) + r')\b'
    raw_text = re.sub(remove, ' ', raw_text)
    raw_text = re.sub(r'\b\w\b', ' ', raw_text)  # remove free-standing single letters
    raw_text = re.sub(r'\b\d+\b', ' digit ', raw_text)  # replace numbers
    return raw_text
def text_cleaner(text):
    text = text.lower()  # convert to lowercase
    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))
    text = re.sub(r'\b\d+\b', ' digit ', text)  # replace numbers with the token ' digit '
    return text
class CleanTextUtil:
    """
    Utility for cleaning text by using stop words and stemming.

    Examples:
        >>> c = CleanTextUtil("french")
        >>> c.stem_words([u"Nous", u"allions", u"à", u"la", u"plage"])
        [u'Nous', u'allion', u'à', u'la', u'plag']
        >>> c.rm_stop_words([u"Nous", u"allions", u"à", u"la", u"plage"])
        [u'Nous', u'allions', u'plage']
        >>> c.clean_text(u"Nous allions à la plage")
        [u'allion', u'plag']

    Attributes:
        stemmer (Stemmer.Stemmer): The stemmer delegate object.
        stopwords (list of str): A list of stop words.
    """

    def __init__(self, language):
        """
        Initializes attributes with the language provided.

        Args:
            language (str): The language used to stem ('french', 'english').
        """
        self.stemmer = Stemmer(language)
        self.stopwords = stopwords.words(language)

    def stem_words(self, words):
        """
        Stems a list of words.

        Args:
            words (list of str): A list of words.

        Returns:
            list of str: The list updated with stemmed words.
        """
        return self.stemmer.stemWords(words)

    def rm_stop_words(self, words):
        """
        Removes stop words from a list of words.

        Args:
            words (list of str): A list of words.

        Returns:
            list of str: The list minus the stop words.
        """
        return [word for word in words if word.lower() not in self.stopwords]

    def clean_text(self, text):
        """
        Cleans a text to optimize search engines.

        Steps of the cleaning:
            1. Transform all characters to lowercase letters.
            2. Find all words with the regular expression "\\w+".
            3. Remove stop words with a filter.
            4. Stem the rest of the words.

        Args:
            text (str): A text.

        Returns:
            list of str: The list of words transformed.
        """
        words = SPLIT_TEXT.findall(text.lower())
        words = self.rm_stop_words(words)
        words = self.stem_words(words)
        return words
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Counts the occurrences of every stemmed word across all .txt files in a directory.
import os
import sys

from nltk import regexp_tokenize
from Stemmer import Stemmer

directory = sys.argv[1]
files = os.listdir(directory)
text_files = [name for name in files if name.endswith('.txt')]

all_text = ""
for name in text_files:
    try:
        with open(os.path.join(directory, name), 'r', encoding='utf-8') as f:
            all_text += f.read()
    except OSError:
        print(name)

stm = Stemmer('russian')
text = stm.stemWords(regexp_tokenize(all_text.lower(), r"(?x) \w+ | \w+(-\w+)*"))
for word in text:
    print(word, " ", text.count(word))
def full_text_search(phrase, limit=10, kind=None,
                     stemming=INDEX_STEMMING,
                     multi_word_literal=INDEX_MULTI_WORD,
                     add_stop_words=frozenset([])):
    """Queries search indices for phrases using a merge-join.

    Args:
        phrase: String. Search phrase.
        kind: String. Returned keys/entities are restricted to this kind.

    Returns:
        A list of (key, title) tuples corresponding to the indexed entities.
        Multi-word literal matches are returned first.

    TODO -- Should provide feedback if input search phrase has stop words, etc.
    """
    # Legacy Google App Engine (Python 2) code.
    index_keys = []
    keywords = unidecode(PUNCTUATION_REGEX.sub(' ', phrase)).lower().split()
    if stemming:
        stemmer = Stemmer('english')
        klass = StemmedIndex
    else:
        klass = LiteralIndex
    current_user = users.get_current_user()

    if len(keywords) > 1 and multi_word_literal:
        # Try to match literal multi-word phrases first
        if len(keywords) == 2:
            search_phrases = [' '.join(keywords)]
        else:
            search_phrases = []
            sub_strings = len(keywords) - 2
            keyword_not_stop_word = map(
                lambda x: x not in STOP_WORDS and x not in add_stop_words,
                keywords)
            for pos in xrange(0, sub_strings):
                if keyword_not_stop_word[pos] and keyword_not_stop_word[pos + 2]:
                    search_phrases.append(' '.join(keywords[pos:pos + 3]))
        for phrase in search_phrases:
            if stemming:
                phrase = ' '.join(stemmer.stemWords(phrase.split()))
            if current_user:
                query = klass.all(keys_only=True).filter('phrases =', phrase).filter(
                    'view_permissions =', current_user.user_id()).order('ordinal')
            pub_query = klass.all(keys_only=True).filter('phrases =', phrase).filter(
                'view_permissions =', 'public').order('ordinal')
            if kind:
                if current_user:
                    query = query.filter('parent_kind =', kind)
                pub_query = pub_query.filter('parent_kind =', kind)
            if current_user:
                index_keys.extend([key for key in query.fetch(limit=limit - len(index_keys))
                                   if key not in index_keys])
            index_keys.extend([key for key in pub_query.fetch(limit=limit - len(index_keys))
                               if key not in index_keys])

    if len(index_keys) < limit:
        # Fall back to single-keyword matches to fill the remaining slots.
        keywords = filter(lambda x: len(x) >= SEARCH_PHRASE_MIN_LENGTH, keywords)
        if stemming:
            keywords = stemmer.stemWords(keywords)
        for keyword in keywords:
            if current_user:
                query = klass.all(keys_only=True).filter('phrases =', keyword).filter(
                    'view_permissions =', current_user.user_id()).order('ordinal')
            pub_query = klass.all(keys_only=True).filter('phrases =', keyword).filter(
                'view_permissions =', 'public').order('ordinal')
            if kind:
                if current_user:
                    query = query.filter('parent_kind =', kind)
                pub_query = pub_query.filter('parent_kind =', kind)
            if current_user:
                index_keys.extend([key for key in query.fetch(limit=limit - len(index_keys))
                                   if key not in index_keys])
            index_keys.extend([key for key in pub_query.fetch(limit=limit - len(index_keys))
                               if key not in index_keys])

    return [(key.parent(), SearchIndex.get_title(key.name())) for key in index_keys]
# Extract the external links section
extl = find_between(text, "xternal links==", "\n\n")
text = text.replace(extl, '')

# Extract the references section
ref = find_between(text, "eferences==", "==") + find_between(text, "eferences ==", "==")
text = text.replace(ref, '')

# Clear the dictionary and work on each field
article_dict = {}

# TITLE
field_tokens = []
title = re.sub('[^A-Za-z]', ' ', title)
chunk = nltk.word_tokenize(title.lower())
stopped_tokens = [i for i in chunk if i not in stop_words]
field_tokens = p_stemmer.stemWords(stopped_tokens)
for i in field_tokens:
    if i in article_dict:
        # Bump the term frequency stored as "...(<freq>)"
        freq = int(find_between(article_dict[i], "(", ")")) + 1
        if "T" in article_dict[i]:
            article_dict[i] = find_between(article_dict[i], "", "(") + "(%d)" % freq
        else:
            article_dict[i] = "T" + find_between(article_dict[i], "", "(") + "(%d)" % freq
    else:
        article_dict[i] = "T%d(1)" % count  # `count` is defined earlier in the full script

# BODY TEXT
field_tokens = []
text = re.sub('[^A-Za-z]', ' ', text)
chunk = nltk.word_tokenize(text.lower())
stopped_tokens = [i for i in chunk if i not in stop_words]