def preprocess_document(data):
    # Step 1: lowercase and strip punctuation
    data = data.lower()
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                   '{', '}', '#', '\\', '/', '@', '\xa0', '\n', '&', '$',
                   '‘', '…', '•', '-']
    for punc in punctuation:
        data = data.replace(punc, '')
    # Step 2: tokenize
    data = list(nltk.word_tokenize(data))
    # Step 3: strip stopwords
    stop = set(stopwords.words('english'))
    extra_stopwords = ['ok', 'oh', 'via', 'bc', 'gon', 'na']  # add any additional stopwords we want to use here
    stop.update(extra_stopwords)
    stop.update(list(string.ascii_lowercase))  # remove all single letters
    data = [i for i in data if i not in stop]  # remove stopwords
    # Step 4: stemming
    stemmer = snowballstemmer.stemmer('english')
    data = stemmer.stemWords(data)
    # Step 5: remove words not in the NLTK English corpus
    # (use a comprehension: calling data.remove(w) while iterating
    # over data skips elements)
    words = set(nltk.corpus.words.words())
    data = [w for w in data if w in words]
    return data

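# Hedged usage sketch (added for illustration; not part of the source).
# Assumes the same globals preprocess_document relies on: nltk,
# snowballstemmer, string, and the NLTK 'punkt', 'stopwords' and 'words'
# corpora already downloaded.
import string
import nltk
import snowballstemmer
from nltk.corpus import stopwords

print(preprocess_document("The quick brown foxes were running!"))
# roughly: ['quick', 'brown', 'fox', 'run']
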
def __init__(self, german):
    path = os.path.dirname(os.path.abspath(__file__))
    print(path)
    self.IX = open_dir(path + "/index")
    self.Writer = self.IX.writer()
    if german:
        self.Stemmer = snowballstemmer.stemmer('german')
    else:
        self.Stemmer = snowballstemmer.stemmer('french')

def cut2list(self, string):
    """Tokenize a string and return the tokens as a list.

    :param string: input text
    :return: list of tokens
    """
    tokens = []
    if self.replaceP:
        sens = split(string, '' if self.type == Analyzer.ANALYZERS.Jieba else ' ')
    else:
        sens = [string]  # [strB2Q(string)]
    for sen in sens:
        if self.type == Analyzer.ANALYZERS.Jieba:
            # tokenize with jieba
            words = self.analyzer.cut(sen, cut_all=False)
        elif self.type == Analyzer.ANALYZERS.nltk:
            # tokenize English text
            sen = sen.lower()
            words = self.analyzer.word_tokenize(sen)
            stemmer = snowballstemmer.stemmer('english')  # argument selects the language
            words = stemmer.stemWords(words)
        if self.useStopwords:
            for word in words:
                if word not in stopwords and len(word.strip()) > 0:
                    tokens.append(word)
        else:
            tokens += words
    return tokens

def getHighlightingsVariables(self, article, variable_keywords, variable_pages):
    stemmer = snowballstemmer.stemmer("german")
    # goodchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'"
    for i in range(len(article)):
        for j in range(len(article[i])):
            article[i][j] = article[i][j].split(" ")
            for k in range(len(article[i][j])):
                # article[i][j][k] = chrtran(article[i][j][k], goodchars, "")
                article[i][j][k] = stemmer.stemWord(article[i][j][k])
    for i in range(len(variable_keywords)):
        # variable_keywords[i] = chrtran(variable_keywords[i], goodchars, "")
        variable_keywords[i] = stemmer.stemWord(variable_keywords[i])
    highlight = []
    for i in range(len(article)):
        highlight_article = []
        for j in range(len(article[i])):
            highlight_variables = []
            for k in range(len(variable_keywords)):
                highlight_variables.append(random.random())
            highlight_article.append(highlight_variables)
        highlight.append(highlight_article)
    return highlight

def clean(text, stemmer='snowball'):
    """Normalize, split, and clean text.

    Parameters
    ----------
    text : str
        Block of text to clean and prepare.
    stemmer : str, optional
        Stemmer to use: [snowball, five, simple, none]

    Returns
    -------
    text : list of str
        Cleaned and prepared tokens.
    """
    if stemmer not in ['snowball', 'five', 'simple', 'none']:
        raise ValueError("Stemmer choice not available.")
    text = re.sub("[{}]".format(string.punctuation), " ", text.lower())
    text = text.split()
    if stemmer == 'five':
        text = [five_stemmer(item) for item in text]
    elif stemmer == 'snowball':
        stemmer = snowballstemmer.stemmer('english')
        text = stemmer.stemWords(text)
    elif stemmer == 'simple':
        text = [simple_stem(item) for item in text]
    text = [item for item in text if item not in STOP_WORDS]
    return text

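# Hedged usage sketch (added): with stemmer='snowball' only the module-level
# STOP_WORDS is needed, so a tiny hypothetical stand-in is defined here; the
# real source provides its own list plus the five_stemmer/simple_stem helpers.
import re
import string
import snowballstemmer

STOP_WORDS = {'the', 'a', 'an'}  # hypothetical stand-in
print(clean("The runners were running quickly", stemmer='snowball'))
# roughly: ['runner', 'were', 'run', 'quick']
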
def seeker_highlight(text, query, algorithm='english'):
    if not query:
        return mark_safe(seeker_format(text))
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except ImportError:
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords_q = [w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w]
    highlight = set(stemWords(keywords_q))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')', r'<em>\1</em>', text, flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return mark_safe(''.join(parts))

def cut(self, string):
    """Tokenize a string and join the tokens with spaces.

    :param string: input text
    :return: tokens as a single space-separated string
    """
    article_contents = ''
    if self.replaceP:
        sens = split(string)
    else:
        sens = [string]  # strB2Q(string)
    for sen in sens:
        if self.type == Analyzer.ANALYZERS.Jieba:
            # tokenize with jieba
            words = self.analyzer.cut(sen, cut_all=False)
        elif self.type == Analyzer.ANALYZERS.nltk:
            # tokenize English text
            sen = sen.lower()
            words = self.analyzer.word_tokenize(sen)
            stemmer = snowballstemmer.stemmer('english')  # argument selects the language
            words = stemmer.stemWords(words)
        if self.useStopwords:
            for word in words:
                if word not in stopwords and len(word.strip()) > 0:
                    article_contents += word + " "
        else:
            article_contents = ' '.join(words)
    return article_contents

def __init__(self, language="es"):
    """Init method.

    :param language: input language (currently only Spanish is wired up)
    """
    self.__stemmer = snowballstemmer.stemmer("spanish")
    Token.set_extension("stem", default="", force=True)

def do_work(*args):
    import snowballstemmer
    stemmer = snowballstemmer.stemmer('english')
    print(js.data.textdata)
    txt = js.data.textdata
    newval = stemmer.stemWords(txt.split())
    return newval

def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        # Convert only ASCII letters to lowercase, to match C behavior
        original = ''.join(
            (lower_(c) if 'A' <= c <= 'Z' else c for c in original))
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            # list.append takes one argument; extend with all three parts
            result.extend([original, " -> ", stemmed])
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()

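# Hedged usage sketch (added; file names hypothetical, and the lower_ helper
# is assumed to exist in the same module): stems one word per line from
# voc.txt into out.txt, writing "original -> stem" lines when pretty == 1.
# stemming("english", "voc.txt", "out.txt", "utf-8", pretty=1)
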
def do_semantic_analysis(sentence):
    sentence_probability_of_negative = 1
    sentence_probability_of_positive = 1
    stem = stemmer('turkish')
    with open("cookit.pythonanywhere.com/comments/text_files/stopwords.txt", "r") as f:
        stopwords_list = f.read().split("\n")
    words_list = sentence.split(" ")
    for word in words_list:
        word = re.sub(r'[^\w\s]', '', word)
        word = word.lower()
        word = stem.stemWords([word])[0]
        if word in stopwords_list:
            continue
        try:
            word_probability_of_negative = ProbabilityOfWords.objects.get(word=word).probabilityOfNegative
        except ProbabilityOfWords.DoesNotExist:
            word_probability_of_negative = 1
        try:
            word_probability_of_positive = ProbabilityOfWords.objects.get(word=word).probabilityOfPositive
        except ProbabilityOfWords.DoesNotExist:
            word_probability_of_positive = 1
        sentence_probability_of_negative *= word_probability_of_negative
        sentence_probability_of_positive *= word_probability_of_positive
    if sentence_probability_of_positive > sentence_probability_of_negative:
        result = "positive"
    elif sentence_probability_of_positive < sentence_probability_of_negative:
        result = "negative"
    else:
        result = "notr"
    return result

def my_separate_samples(read_input_lines, stem_flag):
    input_splitted_list = []
    input_class_list = []
    if stem_flag == '1':
        print('stemmer')
        my_stemmer = sb.stemmer('turkish')
    for curr_line in read_input_lines:
        curr_line2 = curr_line.lower()
        exclude = string.punctuation
        curr_line3 = ''.join(ch for ch in curr_line2 if ch not in exclude)
        curr_line4 = curr_line3.split('\t')
        curr_sample = curr_line4[0].split()
        curr_sample = list(set(curr_sample))
        curr_class = curr_line4[1].replace('\n', '')
        if stem_flag == '1':
            stemmed_curr_sample = []
            for wt in curr_sample:
                if len(wt) > 5:
                    stemmed_curr_sample.append(my_stemmer.stemWord(wt))
                else:
                    stemmed_curr_sample.append(wt)
            curr_sample = stemmed_curr_sample
        input_splitted_list.append(curr_sample)
        input_class_list.append(curr_class)
    return input_splitted_list, input_class_list

def __init__(self) -> None:
    warnings.warn(
        f"{self.__class__.__name__} is deprecated, use "
        "snowballstemmer.stemmer('porter') instead.",
        RemovedInSphinx70Warning, stacklevel=2)
    self.stemmer = snowballstemmer.stemmer('porter')

def WordTabLemma(fin, fout):
    '''Convert one-word-per-line format to word-tab-lemma-per-line format.'''
    stemmer = snowballstemmer.stemmer('english')
    with open(fin, 'rt') as fi, open(fout, 'wt') as fo:
        for word in fi:
            word = word.strip()
            fo.write("{}\t{}\n".format(word, stemmer.stemWord(word)))

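# Hedged usage sketch (added; file names hypothetical): turns a file with one
# word per line into tab-separated "word<TAB>stem" lines, e.g. "running\trun".
# WordTabLemma("wordlist.txt", "wordlist_stemmed.tsv")
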
def __name_follows(self, token):
    """Split the token based on the letter it starts with.

    Args:
        token (str): a word

    Returns:
        list: the split word
    """
    follows = [
        '\u0628',  # ب
        '\u0643',  # ك
        '\u0644',  # ل
        '\u0648',  # و
        '\u062a',  # ت
        '\u0633',  # س
    ]
    stem = stemmer("arabic").stemWord(token)
    for follow in follows:
        if token.startswith(follow) and not stem.startswith(follow):
            token = re.sub(follow, r'\g<0><SPLIT>', token, flags=re.UNICODE)
    return token.split("<SPLIT>")

def stemmer(pList):
    stemmer = snowballstemmer.stemmer('spanish')
    stemmedWords = set([])
    for word in pList:
        stemmed = stemmer.stemWord(word)
        stemmedWords.add(stemmed)  # add the stem, not the original word
    return stemmedWords

def init(self, options: Dict) -> None:
    if JIEBA:
        dict_path = options.get('dict')
        if dict_path and os.path.isfile(dict_path):
            jieba.load_userdict(dict_path)
    self.stemmer = snowballstemmer.stemmer('english')

def __init__(self, language=None):
    """Create a new highlighter for the specified language."""
    if language:
        self.stem = snowballstemmer.stemmer(language)
    else:
        self.stem = NoStem()

def gen_words(text, stemming=stem.stemmer('english')):
    """Create a generator of stemmed words.

    :param text: some string
    :param stemming: variant of stemming algorithm
    :return: generator yielding stemmed words from text
    """
    for word in stemming.stemWords(re.findall(r"[\w']+", text.lower())):
        yield word

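# Hedged usage sketch (added): gen_words is lazy, so it can feed a Counter
# directly; `stem` is assumed to be snowballstemmer aliased at import time,
# matching the default argument above.
from collections import Counter

print(Counter(gen_words("Cats chase cats; a cat was chased.")))
# roughly: Counter({'cat': 3, 'chase': 2, 'a': 1, 'was': 1})
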
def turkish(sent):
    # No Turkish stemmer in NLTK
    stem = snowballstemmer.stemmer('turkish')
    stop = stopwords.words('turkish')
    tx = word_tokenize(sent)
    mx = stem.stemWords(tx)
    px = [x for x in mx if x not in stop]
    return px

def __init__(self, N=8):
    """Create the object.

    :param int N: max length of the suffix used in the rules
    """
    self.N = N
    self._rules = [None] * (N + 1)
    self._stemmer = sbs.stemmer('italian')

def stem2(in_vec):
    stemmer = snowballstemmer.stemmer('english')
    out_vec = []
    for x in in_vec:
        to_out = stemmer.stemWord(x)
        if len(to_out) > 2:
            out_vec.append(to_out)
    return out_vec

def get_feature_base(sentence):
    stemmer = snowballstemmer.stemmer('english')
    words = sentence.split()
    result = []
    for word in words:
        if is_stopword(word):
            continue
        result.append(stemmer.stemWord(word))
    return ' '.join(result)

def get_stemmer(language):
    stemmer_languages = [
        "danish", "dutch", "english", "finnish", "french", "german",
        "hungarian", "italian", "norwegian", "portuguese", "romanian",
        "russian", "spanish", "swedish", "turkish",
    ]
    if language.lower() in stemmer_languages:
        # build the stemmer once instead of on every call of the returned function
        stemmer = snowballstemmer.stemmer(language)
        return stemmer.stemWord
    return lambda word: word

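# Hedged usage sketch (added): the returned callable stems supported
# languages and passes anything else through unchanged.
import snowballstemmer

stem = get_stemmer("german")
print(stem("laufen"))          # a stemmed German form
noop = get_stemmer("klingon")  # unsupported -> identity function
print(noop("laufen"))          # unchanged: 'laufen'
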
def stem_and_lower(str_):
    """Return string with unique lowercase words stemmed."""
    stemmer = snowballstemmer.stemmer(config.LANGUAGE_FULL)
    str_no_punctuation = str_.translate(REMOVE_PUNCTUATION_MAP)
    str_stemmed = stemmer.stemWords(
        map(lambda x: x.lower(), set(str_no_punctuation.split())))
    return ' '.join(str_stemmed)

def aplicarStemmer(pDictPalabrasArchivos):
    print("aplicando stemming...")
    dictRaices = {}
    stemmer = snowballstemmer.stemmer("spanish")
    for docId, palabras in pDictPalabrasArchivos.items():
        raices = stemmer.stemWords(palabras)
        dictRaices[docId] = raices
    ## archivo.archivo.crearCSVDict(".\stemming.csv", dictRaices)
    return dictRaices

def rootsoftheliturgical(words):
    words = words.lower()
    rootfind = stemmer('turkish')
    trans = str.maketrans('', '', punctuation)
    words = words.translate(trans)
    words = StopWords(words)
    # letters = words.split()
    letters = rootfind.stemWords(words)
    string = ' '.join(letters)
    return string

def __init__(self, xml):
    self.dest = xml.get("dest")
    if self.dest is None:
        raise ValueError()
    # any value for "verbose", even a falsy one, enables verbose mode
    self.verbose = xml.get("verbose") is not None
    self.stemmer = snowballstemmer.stemmer('english')

def main():
    stemmer = snowballstemmer.stemmer('english')  # load the stemming module
    for i, line in enumerate(sentence_extraction()):
        if i == 10:
            break
        words = line.strip('\n').split(' ')
        for word in words:
            # stem each word with stemmer.stemWord(word)
            print('{}\t{}'.format(word, stemmer.stemWord(word)))
        print('\n')

def snowball_tokenfilter(token):
    """
    Snowball token filter uses the Snowball stemming library
    collection for Python: https://github.com/shibukawa/snowball_py
    """
    stemmer = snowballstemmer.stemmer("english")
    token["token"] = stemmer.stemWord(token["token"])
    return token

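# Hedged usage sketch (added): token filters receive and return a dict with a
# "token" key.
import snowballstemmer

print(snowball_tokenfilter({"token": "running"}))  # {'token': 'run'}
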
def __init__(self, **kwargs):
    self.basic_params = kwargs['basic']
    self.emb_params = kwargs['embeddings']
    self.sim_params = kwargs['similarity']
    self.subj_params = kwargs['subjectivity']
    self.sent_params = kwargs['sentiment']
    self.emo_params = kwargs['emotion']
    self.nlp = spacy.load(self.basic_params['model'])
    self.stemmer = stemmer('greek')
    self.lexicon_ = None

def stem2(word):
    stemmer = snowballstemmer.stemmer("turkish")
    stemmed = stemmer.stemWord(word)
    # manually restore a couple of Turkish stems that snowball over-truncates
    if stemmed == "fatur":
        stemmed = "fatura"
    elif stemmed == "hatt":
        stemmed = "hat"
    return stemmed

def clean_text_stemmed(t):
    """Accept a document and return lowercased, stemmed text."""
    t = t.lower()
    # Replace non-alphanumeric characters with spaces
    t = re.sub("[^A-Za-z0-9]", " ", t)
    # Replace all numbers with a single char
    t = re.sub("[0-9]+", "#", t)
    stemmer = snowballstemmer.stemmer('english')
    tfinal = " ".join(stemmer.stemWords(t.split()))
    return tfinal  # return the stemmed text, not the unstemmed input

def text_cleaner(text):
    stemmer = snowballstemmer.stemmer('russian')
    text = text.lower()                               # lowercase
    text = re.sub(r'https?://[\S]+', ' url ', text)   # replace web links
    text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', text)
    text = re.sub(r'<[^>]*>', ' ', text)              # remove HTML tags
    text = re.sub(r'[\W\n]+', ' ', text)              # remove extra symbols
    text = re.sub(r'\w*\d\w*', '', text)              # drop tokens containing digits
    text = re.sub(r'\w*[.]\w*', '', text)             # drop tokens containing dots
    text = ' '.join(stemmer.stemWords(text.split()))  # reduce words to stems
    return text

def aplicarStemmerConsulta(pLista):
    print("aplicando stemming...")
    lista = []
    stemmer = snowballstemmer.stemmer('spanish')
    for i in pLista:
        raiz = stemmer.stemWords([i[0]])[0]
        lista.append([raiz, i[1]])
    return lista

def create_search_terms(string_terms):
    '''Create search terms by stemming every word within the parameter
    passed. Return all search terms in one string separated by spaces.'''
    stemmer = snowballstemmer.stemmer('english')
    terms = stemmer.stemWords(string_terms.split())
    search_term = list()
    for term in terms:
        lower_term = term.lower()
        if lower_term not in _STOP_WORDS:
            search_term.append(lower_term)
    return " ".join(search_term)

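# Hedged usage sketch (added): _STOP_WORDS is module state in the original
# source, so a hypothetical stand-in is defined here.
import snowballstemmer

_STOP_WORDS = {"for", "the"}  # hypothetical stand-in
print(create_search_terms("Running shoes for the runners"))
# roughly: "run shoe runner"
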
def __init__(self, samples=None, stopwords="english", limit=20, logging=False):
    """
    Create a vocabulary, which is a mapping from bucket names to lists of
    synonyms that fall into their bucket. Stopwords is a list of words that
    are ignored for the vocabulary and defaults to a built-in English
    stopword list.
    """
    self.stopwords = stopwords
    self.stemmer = snowballstemmer.stemmer("english")
    self.tokens = re.compile(r"[A-Z]?[a-z]{2,}")
    self.logging = logging
    if samples:
        self._generate_vocabulary(samples, limit)

def search_result(request):
    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    for word in q_words:
        lng = detect(word)
        if lng in LANGUAGES:
            lng = LANGUAGES[lng]
            stemmed_words.append(snowballstemmer.stemmer(lng).stemWord(word))
        else:
            stemmed_words.append(word)
    # NOTE: stemmed_words is computed but not yet passed to the template context
    return render(request, 'searchres/search_result.html', {})

def getPalabras():
    file = "dicc.txt"
    stemmer = snowballstemmer.stemmer('spanish')
    words = {}
    with open(file, 'r') as arc:
        for i in arc:
            i = i.rstrip()
            i = stemmer.stemWord(i)
            words[i] = "word"
    for i in words.items():
        print(i)
    print(len(words))

def get_coursed_and_create_matrix():
    results = [course for course in modulestore().get_courses()
               if course.scope_ids.block_type == "course"]
    new_matrix = TfidMatrixAllCourses.objects.all().first() or TfidMatrixAllCourses()
    print(new_matrix.matrix.shape[0] != len(results))
    if new_matrix.matrix.shape[0] != len(results):
        all_courses = [re.sub("<[^>]*>", "", CourseDetails.fetch_about_attribute(x.id, "overview"))
                       for x in results]
        MatrixEdxCoursesId.objects.all().delete()
        # use an explicit loop: a bare map() is never evaluated on Python 3
        for x in results:
            MatrixEdxCoursesId.objects.create(course_key=x.id, course_index=results.index(x))
        stemmer = snowballstemmer.stemmer("english")
        courses_stem = [" ".join(stemmer.stemWords(x.split())) for x in all_courses]
        vect = TfidfVectorizer(stop_words=get_stop_words(), lowercase=True, dtype=np.float32)
        matrix = vect.fit_transform(courses_stem)
        new_matrix.matrix = matrix
        new_matrix.save()

def identify_language(self, text):
    self.lang = lang_mapping[langid.classify(text)[0]]
    if self.debug:
        print("LANG", self.lang)  # , "stemmer", self.stem
    if self.lang == "greek":
        from stemmers.greek import stem, stopwords
        self.stem = stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    elif self.lang == "turkish":
        # unfortunately, the Turkish stemmer isn't included in NLTK
        import snowballstemmer
        from stemmers.turkish import stopwords
        self.stem = snowballstemmer.stemmer("turkish").stemWord
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    else:
        from nltk.stem import SnowballStemmer
        from nltk.corpus import stopwords
        self.stem = SnowballStemmer(self.lang).stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords.words(self.lang))

def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json.keys():
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
        if 'stemmed' in o.json.keys():
            # if stemmed before being marked foreign, remove this info
            del o.json['stemmed']
            F = open(fn, 'w')
            F.write(o.getJSON())
            F.close()
            return 2
        else:
            return 0
    changed = False
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    stemmed = stemmer(string2words(o.get('title')))
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
        print(string2words(o.get('title')))
        print(stemmer(string2words(o.get('title'))))
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') != stemmed:
        o.json['stemmed'] = stemmed
        changed = True
    if changed:
        F = open(fn, 'w')
        F.write(o.getJSON())
        F.close()
        return 2
    else:
        return 0

def main():
    argv = sys.argv
    if len(argv) < 2:
        usage()
        return
    algorithm = 'english'
    if len(argv) > 2:
        algorithm = argv[1]
        argv = argv[2:]
    else:
        argv = argv[1:]
    stemmer = snowballstemmer.stemmer(algorithm)
    splitter = re.compile(r"[\s\.-]")
    for arg in argv:
        for word in splitter.split(arg):
            if word == '':
                continue
            original = word.lower()
            print(original + " -> " + stemmer.stemWord(original))

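# Hedged usage sketch (added; script name hypothetical). From a shell:
#   python stem_cli.py english "sentence.with-words to-stem"
# prints one "original -> stem" line per whitespace/dot/hyphen separated token.
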
def preprocess_features(dataframe):
    # get the count of how many times each product appears; may correlate
    product_counts = pandas.DataFrame(pandas.Series(
        dataframe.groupby(["product_uid"]).size(), name="product_count"))
    dataframe = pandas.merge(dataframe, product_counts, left_on="product_uid",
                             right_index=True, how="left")
    dataframe = experiment_gensim(dataframe)
    dataframe["search_length"] = dataframe.search_term.str.len()
    dataframe["id_bins"] = pandas.cut(dataframe.id, 20, labels=False)

    # word distribution metrics
    dataframe["title_unigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1), axis=1)
    dataframe["title_bigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2), axis=1)
    dataframe["desc_unigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1), axis=1)
    dataframe["desc_bigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2), axis=1)
    dataframe["brand_unigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(1), axis=1)
    dataframe["brand_bigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(2), axis=1)

    # stemmed unigrams and bigrams
    stemmer = snowballstemmer.stemmer("english")
    dataframe["title_unigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_unigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["title_bigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_bigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)

    # edit distance metrics (slow)
    dataframe["title_word_edit_distance"] = dataframe[["search_term", "product_title"]].apply(word_edit_distance, axis=1)
    dataframe["title_char_edit_distance"] = dataframe[["search_term", "product_title"]].apply(char_edit_distance, axis=1)
    # dataframe["desc_word_edit_distance"] = dataframe[["search_term", "product_description"]].apply(word_edit_distance, axis=1)
    # dataframe["desc_char_edit_distance"] = dataframe[["search_term", "product_description"]].apply(char_edit_distance, axis=1)

    dataframe = dataframe.drop(["product_title", "search_term", "id",
                                "product_description", "brand_name"], axis=1)
    print(dataframe.describe())
    return dataframe

def textrank(text, hdr):
    sent_tokenizer = PunktSentenceTokenizer()
    sentences = sent_tokenizer.tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')
    # find the most likely language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]
    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word)
                 for word in word_tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code

def seeker_highlight(text, query, algorithm='english'):
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except ImportError:
        # fall back to no-op stemming when snowballstemmer is unavailable
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords = [w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w]
    highlight = set(stemWords(keywords))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')', r'<em>\1</em>', text, flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return ''.join(parts)

def score_words_in_sentence(l_sentence, best_stops):
    """
    Define a relevance score for each word in a tweet.
    We consider only non-stop words, stemmed, both for the tweet and the stop name.
    :param l_sentence: tweet split into a list of words
    :param best_stops: list of the most relevant stops
    :return: 1. list of kept/stemmed words found in the tweet
             2. list of scores for each of these words
    """
    sb_stemmer = stemmer('french')
    stemmed_sentence = [sb_stemmer.stemWords([x])[0] for x in l_sentence]
    scores_stops = []
    tag_words = [0 for _ in l_sentence]
    relevant_stops = map(lambda x: unicodedata.normalize('NFD', x[0][0]).encode('ascii', 'ignore'),
                         best_stops)
    rg_stop = 1
    for stop in relevant_stops:
        stop_w_index = []
        stop_lw = re.findall(r"\w+", stop, re.UNICODE)
        for w_stop in stop_lw:
            if w_stop not in fr_stop_words:
                stemmed_w = sb_stemmer.stemWords([w_stop])[0]
                if stemmed_w in stemmed_sentence:
                    stop_w_index.append(stemmed_sentence.index(stemmed_w))
        score_w = 0
        for i in range(len(stop_w_index)):
            if (i > 0) and (stop_w_index[i] <= stop_w_index[i - 1]):
                score_w = 0
                break
            else:
                score_w += 1
        scores_stops.append(score_w)
        if score_w != 0:
            for idx in stop_w_index:
                if tag_words[idx] == 0:
                    tag_words[idx] = rg_stop
        rg_stop += 1
    return scores_stops, tag_words

def get_frequencies(word_dict, date, max_date):
    stemmer = snowballstemmer.stemmer('dutch')
    freqs = defaultdict(lambda: defaultdict(int))
    freqs_per_day = defaultdict(lambda: defaultdict(int))
    oneday = datetime.timedelta(1)
    # print("min", date, "\nmax", max_date)
    current_year = date.year
    print("current year", current_year)
    while date <= max_date:
        # print("current date", date)
        has_file = True
        if current_year != date.year:
            current_year = date.year
            print("current year", current_year)
        try:
            f_in = open(input_path + r"\words" + str(date) + ".txt", "r")
        except IOError:
            # print("File not found\n", input_path + r"\words" + str(date) + ".txt")
            has_file = False
        if has_file:
            for line in f_in:
                line = line.replace("\n", "")
                line = line.split(";")
                word = line[0]
                freq = int(line[1])
                if use_stemmer:
                    word = stemmer.stemWord(word)
                if word_dict[word][0]:
                    if log_correlation:
                        freqs[word][date] = freq
                    freqs_per_day[date][word] = freq
            f_in.close()
        date += oneday
    return freqs, freqs_per_day

def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            # list.append takes one argument; extend with all three parts
            result.extend([original, " -> ", stemmed])
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()

def init(self, options):
    self.stemmer = snowballstemmer.stemmer('russian')

def lemmatizer(self, word):
    # note: snowball performs stemming, not true lemmatization
    stemmer = snowballstemmer.stemmer('spanish')
    return stemmer.stemWord(word)

def init(self, options):
    # type: (Any) -> None
    self.stemmer = snowballstemmer.stemmer('italian')

#!/usr/bin/python
# -*- coding: iso-8859-9 -*-
import argparse, os, re, sys, operator, math
import snowballstemmer

FILE_ENCODING = "windows-1254"
SMOOTHING_CONST = 0.1
STEMMING = True

# Create stemmer
stemmer = snowballstemmer.stemmer("turkish")
# prior_prob is a dictionary containing the prior probabilities of authors
prior_prob = dict()
# word_prob is a dictionary containing, per author, a dictionary of word probabilities
word_prob = dict()
# total_words maps each author to the number of words in all of their training data
total_words = dict()
# total_docs is the number of documents in the training set
total_docs = 0
# authors is the list of all author names
authors = []
# unknown-word probabilities for each author
unknown_prob = dict()

def init(self, options):
    # type: (Any) -> None
    self.stemmer = snowballstemmer.stemmer('danish')

def init(self, options):
    # type: (Any) -> None
    self.stemmer = snowballstemmer.stemmer('portuguese')

# coding=utf-8
import re
import codecs
import pickle  # was `import cPickle as pickle` on Python 2
import pymorphy2
from snowballstemmer import stemmer

__author__ = 'annie'

__morph = pymorphy2.MorphAnalyzer()
__stemmer = stemmer('russian')
__pattern = re.compile(u'(?u)[A-zА-я]{2,}')

open_read = lambda file: codecs.open(file, encoding='utf-8', mode='r')
open_write = lambda file: codecs.open(file, encoding='utf-8', mode='w')


def str_dict(dict_):
    """
    Correct conversion of a dict to a string, without the nesting level of values.
    :param dict_: some dict
    :return: str
    """
    ans = [u'{0}: {1}'.format(k, v) for k, v in sorted(dict_.items())]
    return u'\n'.join(ans)


def print_dict(dict_):
    # the suffix reports the number of keys ("ключей" means "keys")
    print(str_dict(dict_), u'\n\n{0} ключей'.format(len(dict_)))