Example #1
    def learn(self, class_name):
        self.classes.add(class_name)
        print class_name
        self.words_freq[class_name] = {}
        if class_name == "internet":
            dir_name = learn_internet
        else:
            dir_name = learn_nointernet

        for file_name in os.listdir(dir_name):
            print "processing", file_name
            text = open(dir_name + "/" + file_name, "r").read().decode("utf-8")
            words = [word.lower() for word in tokenizers.extract_words(text)]
            self.docs_number += 1
            self.unique_words_set = self.unique_words_set | set(words)
            stemmer = RussianStemmer()
            for word in words:
                stemmed = stemmer.stem(word)
                if stemmed in self.words_freq[class_name]:
                    self.words_freq[class_name][stemmed] += 1
                else:
                    self.words_freq[class_name][stemmed] = 1

            if class_name in self.words_in_class:
                self.words_in_class[class_name] += len(words)
                self.docs_in_class[class_name] += 1
            else:
                self.words_in_class[class_name] = len(words)
                self.docs_in_class[class_name] = 1
Example #2
    def learn(self, class_name):
        self.classes.add(class_name)
        print class_name
        self.words_freq[class_name] = {}
        if class_name == "internet":
            dir_name = learn_internet
        else:
            dir_name = learn_nointernet

        for file_name in os.listdir(dir_name):
            print "processing", file_name
            text = open(dir_name + "/" + file_name, "r").read().decode("utf-8")
            words = [word.lower() for word in tokenizers.extract_words(text)]
            self.docs_number += 1
            self.unique_words_set = self.unique_words_set | set(words)
            stemmer = RussianStemmer()
            for word in words:
                stemmed = stemmer.stem(word)
                if stemmed in self.words_freq[class_name]:
                    self.words_freq[class_name][stemmed] += 1
                else:
                    self.words_freq[class_name][stemmed] = 1

            if class_name in self.words_in_class:
                self.words_in_class[class_name] += len(words)
                self.docs_in_class[class_name] += 1
            else:
                self.words_in_class[class_name] = len(words)
                self.docs_in_class[class_name] = 1
Example #3
    def textrank(self, text, similar='serense'):
        text = treatment_text(text)
        text = text.split('.')
        text = list(filter(lambda x: len(x.split()) > 6, text))
        text = '.'.join(text)

        sentences = sent_tokenize(text)
        tokenizer = RegexpTokenizer(r'\w+')
        lmtzr = RussianStemmer()
        words = [
            set(
                lmtzr.stem(word)
                for word in tokenizer.tokenize(sentence.lower()))
            for sentence in sentences
        ]

        pairs = combinations(range(len(sentences)), 2)
        if similar == 'serense':
            scores = [(i, j, self.similarity_1(words[i], words[j]))
                      for i, j in pairs]
        if similar == 'cos':
            scores = [(i, j, self.similarity_2(words[i], words[j]))
                      for i, j in pairs]

        scores = filter(lambda x: x[2], scores)

        g = nx.Graph()
        g.add_weighted_edges_from(scores)
        pr = nx.pagerank(g)

        return sorted(
            ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
            key=lambda x: pr[x[0]],
            reverse=True)
Example #4
 def fillDicts(self, maxDocs=0):
     self.classes = set()
     self.documentsInClass = dict()  # number of documents per class
     self.documentsNumber = 0  # total number of documents
     self.uniqueWords = set()  # set of unique words
     self.wordsInClass = dict()  # number of words per class
     self.wordsFreq = dict()  # frequency of each word within a class
     i = 0
     for document in self.collection.find():
         i += 1
         if i > maxDocs and maxDocs > 0:
             break
         if i % 100 == 0:
             print "Processed " + str(i) + " documents"
         self.classes.add(document['topic'])
         match = re.findall(re.compile(u"[а-яА-Яa-zA-Z0-9]*"), document['body'])
         match = [word for word in match if word != '']
         self.documentsNumber += 1
         self.uniqueWords = self.uniqueWords | set(match)
         wordsFreq = dict()
         stemmer = RussianStemmer()
         for _match in match:
             stemmed = stemmer.stem(_match)
             if stemmed in wordsFreq:
                 wordsFreq[stemmed] += 1
             else:
                 wordsFreq[stemmed] = 1
         if document['topic'] in self.wordsInClass:
             self.wordsInClass[document['topic']] += len(match)
             self.wordsFreq[document['topic']].update(wordsFreq)
             self.documentsInClass[document['topic']] += 1
         else:
             self.wordsInClass[document['topic']] = len(match)
             self.wordsFreq[document['topic']] = wordsFreq
             self.documentsInClass[document['topic']] = 1
Example #5
    def parse(self, fname):
        """
        Парсинг текста файла
        :param fname: имя файла
        :return: (<имя_файла>, тошнота, мошенничество)
        """
        density, fraud = 0, 0
        with codecs.open(fname, "r", encoding="utf-8") as f:
            text = f.read()
        tknz = RegexpTokenizer(pattern="[А-Яа-яA-Za-zё]+")
        txt_list = tknz.tokenize(text)
        if txt_list:
            for i, word in enumerate(txt_list):
                new_word = self.check_word(word)
                if new_word:
                    txt_list[i] = new_word
                    fraud += 1

            txt_list = [
                word.lower() for word in txt_list
                if not (word.lower() in self.sw)
            ]
            stemmer_ru = RussianStemmer()
            txt_list = [
                stemmer_ru.stem(token.lower()) for token in txt_list
                if len(token) > 1
            ]
            dict_w = Counter(txt_list)
            top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
            top5_count = sum([dict_w[word] for word in top5])
            density = top5_count / len(txt_list)
        # the (fraud > 2) criterion was chosen based on testing on the available sample
        # listings often contain words like "ШxДхВ" (WxDxH) that cannot be recognized unambiguously
        # this criterion is open to discussion and could be adjusted
        return fname, density, fraud > 2
Example #6
    def parse_text(self, text):
        text = list(text)

        for i in range(len(text)):
            is_cyrillic_symbol = False
            if text[i] >= 'А' and text[i] <= 'Я':
                is_cyrillic_symbol = True
            if text[i] >= 'а' and text[i] <= 'я':
                is_cyrillic_symbol = True

            if is_cyrillic_symbol == False:
                text[i] = ' '

        text = ''.join(text)
        text = text.split()
        filtered_words = [
            word for word in text if word not in stopwords.words('russian')
            and word not in self.badwords
        ]

        stemmer = RussianStemmer()

        for i in range(len(filtered_words)):
            filtered_words[i] = stemmer.stem(filtered_words[i])

        return filtered_words
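
A side note on Example #6: the list comprehension evaluates stopwords.words('russian') once per word and then scans the resulting list, so the filter is quadratic in practice. A minimal sketch of the same filter with the stopword set built once (self.badwords is assumed to be an iterable of strings, as in the original):

        stop_set = set(stopwords.words('russian')) | set(self.badwords)
        filtered_words = [word for word in text if word not in stop_set]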
Example #7
def stem_corpus(input_path, output_path):
    stem = RussianStemmer()
    last_word = ''

    i = 0
    with open(output_path, 'w', encoding='utf8') as o:
        with open(input_path, 'r', encoding='utf8') as f:
            while True:
                s = f.read(1024 * 1024)
                if not s or not len(s):
                    o.write(last_word)
                    break

                words = s.split(' ')

                if s[0] != ' ':
                    # last_word was incomplete; glue it to the first word of this chunk
                    words[0] = last_word + words[0]
                elif last_word:
                    # the previous chunk ended exactly on a word boundary, so flush last_word now
                    o.write(stem.stem(last_word) + ' ')

                for word in words[:-1]:
                    stemmed = stem.stem(word)
                    o.write(stemmed + ' ')

                i += 1
                print('Stemmed {} MBs'.format(i))

                last_word = words[-1]
Example #8
def stemming_sent(sent):
    pattern = re.compile('[a-zA-Zа-яА-Я]+')
    words = pattern.findall(sent)
    stemmer = RussianStemmer()
    words = list(map(lambda word: stemmer.stem(word), words))
    new_sent = functools.reduce(lambda x, y: x + ' ' + y, words)
    return new_sent
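
The functools.reduce concatenation in Example #8 rebuilds the string on every step; the idiomatic, linear-time equivalent is a plain join, roughly:

    new_sent = ' '.join(words)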
Example #9
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword =[
        'дом',
        'город',
        "дорог",
        "час",
        "ноч",
        "слов",
        "утр",
        "стран",
        "пут",
        "путешеств",
        "мест",
        'нов',
        "друз",
        "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
Example #10
def stemming(corpus):
    stemmer = RussianStemmer()
    stems = []
    for comment in corpus:
        comment = comment.split()
        s = [stemmer.stem(word) for word in comment]
        stems.append(' '.join(s))
    return stems
Example #11
def method2(tokens):
    print("The way 2")
    stemmer = RussianStemmer(False)
    dictionary = dict()
    for word in tokens:
        normal_form = stemmer.stem(word)
        dictionary[normal_form] = dictionary.get(normal_form, 0) + 1
    printDic(dictionary, 2)
Example #12
def preprocessing(sentence):
    porter = RussianStemmer()
    punctuation = string.punctuation + "«»—•’"
    stop = stopwords.words('russian')

    for p in punctuation:
        sentence = sentence.replace(p, "")
    sentence = [porter.stem(word) for word in sentence.split() if word not in stop]
    return sentence
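
One detail of Example #12: the stopword test compares the original casing against NLTK's lowercase stopword list, so capitalized stopwords such as 'Это' or 'И' pass through. A minimal variant that lowercases before filtering, using the same names as the original:

    sentence = [porter.stem(word) for word in sentence.lower().split() if word not in stop]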
Example #13
def build_stemmer_morphology(data_filename, output_filename):
  vocab = load_vocab(data_filename)

  print 'Total words in vocab: %d' % len(vocab)
  prefix_map = defaultdict(set)
  stemmer = RussianStemmer()
  for w in vocab:
    prefix_map[stemmer.stem(w)].add(w)
  print 'Total lemm groups: %d' % (len(prefix_map))
  write_morphology(prefix_map, output_filename)
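
For illustration, prefix_map in Example #13 groups the vocabulary by stem, so words such as 'книга', 'книги' and 'книгу' should all land under one key (the stem shown is what the Snowball stemmer typically produces; load_vocab and write_morphology come from the surrounding module):

  # prefix_map ≈ {'книг': {'книга', 'книги', 'книгу'}}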
Example #14
 def __init__(self):
     self.stop_words = list(set(stopwords.words('russian')).union(set(stopwords.words('english'))))
     self.vectorizer = CountVectorizer(max_df=0.75)
     self.transformer = TfidfTransformer()
     self.scaler = MaxAbsScaler()
     self.classifier = LogisticRegression()
     self.swearings_list = []
     self.stemmer = RussianStemmer()
     with open('swearings.txt', 'r') as file:
         self.swearings_list = list(map(self.stemmer.stem, file.read().split()))
Example #15
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     choices = hay_forms.model_choices()
     self.fields["models"] = forms.ChoiceField(choices=choices,
                                               required=False,
                                               label='Искать',
                                               widget=forms.RadioSelect,
                                               initial=choices[0][0])
     self.stopwords = set(stopwords.words('russian'))
     self.stemmer = RussianStemmer()
     self.tokenizer = RegexpTokenizer(r'\w+')
Example #16
def nltk_preprocessor(sentences):
    ''' tokenization + stemming '''

    tokenizer = RegexpTokenizer(r'\w+')
    # stem each word down to its root form
    lmtzr = RussianStemmer()
    words = [set(lmtzr.stem(word)                                # stemming
                for word in tokenizer.tokenize(sentence.lower()) # tokenization
             )
             for sentence in sentences
    ]
    return words
Example #17
def calculate_class_score(sentence, class_name, show_details=False):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if RussianStemmer().stem(word.lower()) in class_words[class_name]:
            # treat each word with same weight
            score += (1 / corpus_words[RussianStemmer().stem(word.lower())])

            if show_details:
                print("   match: %s" % RussianStemmer().stem(word.lower()))
    return score
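
Example #17 constructs a fresh RussianStemmer and re-stems the word up to three times per token. A sketch of the same scoring loop with a single reused stemmer, assuming the class_words and corpus_words dictionaries the original relies on:

def calculate_class_score(sentence, class_name, show_details=False):
    stemmer = RussianStemmer()               # build the stemmer once, not per word
    score = 0
    for word in nltk.word_tokenize(sentence):
        stem = stemmer.stem(word.lower())
        if stem in class_words[class_name]:
            score += 1 / corpus_words[stem]  # same equal-weight scoring as above
            if show_details:
                print("   match: %s" % stem)
    return score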
Example #18
 def __init__(self, stopwords, ignorechars, docs):
     self.stemmer = RussianStemmer()
     self.wdict = {}
     self.dictionary = []
     self.stopwords = stopwords
     if type(ignorechars) == unicode:
         ignorechars = ignorechars.encode('utf-8')
     self.ignorechars = ignorechars
     self.docss = []
     self.docs = docs
     for doc in docs:
         self.add_doc(doc)
Example #19
def detect_cheat_in_text(text):
    """Detect cheats in text"""
    new_text = []
    is_cheat = False
    for word in text:
        is_cheated_word, recovery_token = detect_cheat(word)
        if is_cheated_word:
            is_cheat = True
            new_text.append(recovery_token)
    stop_words = set(stopwords.words('russian'))

    st = RussianStemmer()

    new_text = [word for word in new_text if (word not in stop_words)]
    return is_cheat, [st.stem(word) for word in new_text]
Example #20
def tokenize(text):
    def is_ok(item, stemmer):
        return (item.lower() == item
                and all(elem.isalpha()
                        and elem not in string.ascii_letters
                        and stemmer.stem(item) not in stopwords
                        for elem in item))
    from nltk.stem.snowball import RussianStemmer
    stemmer = RussianStemmer(ignore_stopwords=True)
    tokens = word_tokenize(text)
    return [item for item in tokens if is_ok(item, stemmer)]
Example #21
    def test(self, mode, bpe_model_path=None):
        while True:
            file_path = input("File path: ").strip()
            file_path = r"C:\Users\lezgy\OneDrive\Рабочий стол\Data_summ\data.txt"
            if file_path == "q":
                break
            try:
                with open(file_path, "r", encoding="utf-8") as r:
                    article = r.read().strip().split("\n")
                    article = " ".join(article)
                    if mode in ["lemm", "stem", "gram", "base"]:
                        article = article.lower()
                        article = word_tokenize(article)
                        article = " ".join(article)
                    print(f"real_text : {article}")

                if mode == "lemm":
                    lemmatizer = mystem.Mystem()
                    article = preprocess_lemm(article, lemmatizer)
                elif mode == "stem":
                    stemmer = RussianStemmer(False)
                    article = preprocess_stemm(article, stemmer)
                elif mode == "gram":
                    token_model = youtokentome.BPE(model=bpe_model_path)
                    article = preprocess_gramm(article, token_model)
                self.test_calc(article)
            except Exception as e:
                print(e)
                print("File not found")
Example #22
class Tokenizer(object):
    def __init__(self):
        self.cache = {}
        self.r_stemmer = RussianStemmer()
        self.e_stemmer = EnglishStemmer()

    def process_word(self, w):
        if w in self.cache:
            return self.cache[w]
        else:
            struct = check_structure(w)
            if struct == 'TRASH':
                w_proc = ''
            elif struct == 'WORD':
                if is_ascii(w):
                    w_proc = self.e_stemmer.stem(w)
                else:
                    w_proc = self.r_stemmer.stem(w)
            elif struct == 'NUMBER':
                w_proc = ''
            elif struct == 'COMPLEX':
                w_proc = w
            self.cache[w] = w_proc
            return w_proc

    def tokenize(self, text):
        text = preprosess_text(text)
        words = text.split(' ')
        tokens = []
        for w in words:
            tokens.append(self.process_word(w))
        tokens = [t for t in tokens if len(t)]
        return tokens
Example #23
 def __init__(self):
     self.model = joblib.load("./models/clf.pkl")
     self.vectorizer = joblib.load("./models/vectorizer.pkl")
     self.classes_dict = {
         0: "отрицательный",
         1: "положительный",
         -1: "ошибка"
     }
     self.numbers_str = '0123456789'
     self.punc_translator = str.maketrans(string.punctuation,
                                          ' ' * len(string.punctuation))
     self.num_translator = str.maketrans(self.numbers_str,
                                         ' ' * len(self.numbers_str))
     self.short_word_len = 1
     self.stemmer = RussianStemmer()
     self.stop_words = stopwords.words('russian') + ['br']
Example #24
    def __init__(self):
        self.words = set()
        self.problems = {}
        self.appearances = {}

        self.filter = Filter()
        self.stemmer = RussianStemmer()
Example #25
    def stem_words(self, words):
        """ Stem words by Porter or Snowball stemmers and join to one string """

        stemmer = None

        if self.lang == 'uk':
            return ' '.join(
                [UkrainianStemmer(word).stem_word() for word in words])

        elif self.lang == 'ru':
            stemmer = RussianStemmer()

        elif self.lang == 'en':
            stemmer = EnglishStemmer()

        return ' '.join([stemmer.stem(word) for word in words])
Example #26
    def prep_stem(self, text):
        """
        Eng:
        ============================================================================
        :param text: Text for preprocessing;

        :return: Preprocessed text with all stemmed words.

        Stem all words with Porter stemmer.
        ============================================================================

        Ru:
        ============================================================================
        :param text: Текст для предобработки;

        :return: Обработанный текст, в котором каждое слово подвергнулось стеммингу.

        Стеммингует все слова с помощью стеммера Портера.
        ============================================================================
        """
        if isinstance(text, str):
            if self.lang == "ru":
                return " ".join(
                    [RussianStemmer().stem(word) for word in text.split()])
            return " ".join(
                [PorterStemmer().stem(word) for word in text.split()])
        else:
            raise TypeError("Argument must be str!")
Example #27
class PhraseStemmer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.stemmer = RussianStemmer()

    def tokenize(self, phrase):
        return [self.stemmer.stem(w) for w in self.tokenizer.tokenize(phrase) if len(w.strip()) > 0]
Example #28
def textrank(text):
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()
    words = [
        set(lmtzr.stem(word) for word in tokenizer.tokenize(sentence.lower()))
        for sentence in sentences
    ]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]],
                  reverse=True)
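
A possible way to turn the ranked output of Example #28 into a short summary, assuming similarity() is defined in the surrounding module and text is a Russian document:

    ranked = textrank(text)                  # [(index, score, sentence), ...], best first
    top3 = sorted(ranked[:3])                # restore the original sentence order
    print(' '.join(sentence for _, _, sentence in top3))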
Example #29
    def stem_keyword(self):
        """ Stem keyword by Porter or Snowball stemmers """

        if self.language == 'uk':
            self.keyword = UkrainianStemmer(self.keyword).stem_word()
            return

        elif self.language == 'ru':
            stemmer = RussianStemmer()

        elif self.language == 'en':
            stemmer = EnglishStemmer()

        else:
            return

        self.keyword = stemmer.stem(self.keyword)
Example #30
 def __init__(self, vocabulary_size=5000, debug=False):
     self.stemmer = RussianStemmer()
     self.stem_count = Counter()
     self.validator_regex = re.compile(r'[^А-яЁё]')
     self.cache_stems = {}
     self.vocabulary = None
     self.vocabulary_size = vocabulary_size
     self.debug = debug
     self.positive_tweets = None
     self.negative_tweets = None
     self.tweets_vectors = None
     self.labels = None
     self.x_train = None
     self.y_train = None
     self.x_test = None
     self.y_test = None
     self.model = None
Example #31
def wrk_words_wt_no(sent):

    """Making stemming"""
#     morph = pymorphy2.MorphAnalyzer()
    stemmer = RussianStemmer()
    words=word_tokenize(sent.lower())
    try:
        arr=[]
        for i in range(len(words)):
            if re.search(u'[а-яА-Я]',words[i]):
                arr.append(stemmer.stem(words[i]))  # stemming
#                 arr.append(morph.parse(words[i])[0].normal_form)  # lemmatization
        words1=[w for w in arr if w not in russian_stops]
        words1=No_with_word(words1)
        return words1
    except TypeError:
        pass
Example #32
 def __init__(self):
     HTMLParser.__init__(self)
     self.inside_dd = False
     self.doc_id = 0
     self.token_count = 0
     self.token_sum_len = 0      
     self.iindex = {}
     self.paragraphs = []
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = RussianStemmer()
Example #33
   def cleanText(textToClean):
        myPunctuation = u'–«»—…'
        exclude = set(string.punctuation + myPunctuation)

        #textToClean = unicode(textToClean, "utf-8")
        textToClean = ''.join(ch for ch in textToClean if ch not in exclude)
        textToClean = ''.join([i for i in textToClean if not i.isdigit()])

        stop_words = get_stop_words('ru')
        words_after_deleting_stop_words = [w for w in textToClean.split()
                                           if (not w in stop_words and w in model.vocab)]

        rs = RussianStemmer()
        words_after_stemming = [rs.stem(w) for w in words_after_deleting_stop_words]
        text_after_cleaning = ' '.join(words_after_stemming)
        #text_after_cleaning = text_after_cleaning.replace(u'кпм', '').replace(u'пмп', '')

        if text_after_cleaning:
            return text_after_cleaning
Example #34
def normailize_text(
        data,
        tok=RegexpTokenizer(r'\w[\w\/\-]+'),
        stemmers=[RussianStemmer(ignore_stopwords=True), PorterStemmer()]
):
    # tokenize text into words
    # sequentially apply all stemmers to tokenized words
    # join stemmed words back to sentences
    return [' '.join([reduce(lambda v,f: f.stem(v), stemmers, w) for w in tok.tokenize(line)])
            for line in data]
Example #35
def index(pathh):
    cont = txt_reader(pathh)
    cont = txt_parser(cont)
    stem = RussianStemmer(False)
    stemmed_text = text_stemmer(cont, stem)
    token = stemmed_tokenizer(stemmed_text)
    token.append([])
    vect_tfidf = bool_tf_tfidf(token)[2]
    csv_safe(vect_tfidf)
    return vect_tfidf
Example #36
class findSubject:
    SUBJECTS_NAME = []
    SUBJECTS_REAL_NAME = []
    IS_LOADED_SUBJECTS = False

    regex = 0
    stemer = 0

    def __init__(self):
        self.stemer = RussianStemmer()
        self.regex = re.compile('[^а-яА-Я ]')
        self.load_subjects('textParsing/data/subjects.csv')

    def get_stem(self, token, checkHash=True):
        token = self.regex.sub('', token).lower()
        stem = self.stemer.stem(token)
        return stem

    def load_subjects(self, filepath):
        pd_subjects = pd.read_csv(filepath, delimiter=';')
        self.SUBJECTS_NAME = list(np.array(pd_subjects[['name']]))
        self.SUBJECTS_REAL_NAME = list(np.array(pd_subjects[['subject']]))

        for ind in range(len(self.SUBJECTS_NAME)):
            self.SUBJECTS_NAME[ind] = self.get_stem(
                str(self.SUBJECTS_NAME[ind][0]), False)

        self.IS_LOADED_SUBJECTS = True

    def get(self, text):

        sent = text.split(' ')
        find_fst_po = -1

        for ind, word in enumerate(sent):
            if word == 'по':
                find_fst_po = ind
                break
        if (find_fst_po == -1):
            return None

        subjects = set()

        for ind, word in enumerate(sent):
            if (ind > find_fst_po):
                word = self.get_stem(word, False)
                if (word in self.SUBJECTS_NAME):
                    subjects.add(
                        str(self.SUBJECTS_REAL_NAME[self.SUBJECTS_NAME.index(
                            word)]))

        if (len(subjects) == 0):
            return None

        return subjects
Example #37
def textrank(text):
    """
    TextRank algorithm for text summarization.
    https://gist.github.com/igor-shevchenko/5821166
    """
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()
    words = [set(lmtzr.stem(word) for word in tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr), key=lambda x: pr[x[0]], reverse=True)
Example #38
 def __init__(self, stopwords, ignorechars, docs):
     self.stemmer = RussianStemmer()
     self.wdict = {}
     self.dictionary = []
     self.stopwords = stopwords
     if type(ignorechars) == unicode: ignorechars = ignorechars.encode('utf-8')
     self.ignorechars = ignorechars
     self.docss = []
     self.docs = docs
     for doc in docs:
         self.add_doc(doc)
Example #39
 def __init__(self):
     """
     vect_theme - векторизатор для строк-тем
     vect_poem - векторизатор для строк-стихов
     lin_model - обученная модель логрегрессии
     """
     self.vect_theme = None
     self.vect_poem = None
     self.lin_model = None
     self.stemmer = RussianStemmer(True)
     self.stop_w = stopwords.words('russian')
Example #40
    def learn(self, class_name):
        dir_name = "."
        file_name = "tweets_by_trend.xml"

        self.classes.add(class_name)
        self.words_freq[class_name] = {}

        if class_name == "negative":
            code = 0
        else:
            code = 1

        print "processing", file_name

        tree = ET.parse(dir_name + "/" + file_name)
        root = tree.getroot()
        for tweet in root.findall('tweet'):
            sent = int(tweet.find('sent').text)
            if sent == code:
                text = tweet.find('text').text
                words = [word.lower() for word in tokenizers.extract_words(text)]
                self.docs_number += 1
                self.unique_words_set = self.unique_words_set | set(words)
                stemmer = RussianStemmer()
                for word in words:
                    stemmed = stemmer.stem(word)
                    if stemmed in self.words_freq[class_name]:
                        self.words_freq[class_name][stemmed] += 1
                    else:
                        self.words_freq[class_name][stemmed] = 1

                if class_name in self.words_in_class:
                    self.words_in_class[class_name] += len(words)
                    self.docs_in_class[class_name] += 1
                else:
                    self.words_in_class[class_name] = len(words)
                    self.docs_in_class[class_name] = 1
Example #41
def cleanText(text):
    '''
     Checks and repairs words that contain hidden Latin characters (and vice versa).
     Assumes the text contains only Latin and Cyrillic characters.
    '''

    ad = AlphabetDetector()
    st = RussianStemmer()
    is_broken = False

    clean_text = []

    for word in text:
        if ad.only_alphabet_chars(word, 'CYRILLIC'):
            clean_text.append(word)
        elif ad.only_alphabet_chars(word, 'LATIN'):
            clean_text.append(word)
        else:
            is_broken = True
            clean_text.append(letterSwap(word))

    clean_text = [st.stem(word) for word in clean_text]
    return clean_text, is_broken
Example #42
class LSI(object):
    def __init__(self, stopwords, ignorechars, docs):
        self.stemmer = RussianStemmer()
        self.wdict = {}
        self.dictionary = []
        self.stopwords = stopwords
        if type(ignorechars) == unicode: ignorechars = ignorechars.encode('utf-8')
        self.ignorechars = ignorechars
        self.docss = []
        self.docs = docs
        for doc in docs:
            self.add_doc(doc)

    def prepare(self):
        self.build()
        self.calc()

    def dic(self, word, add = False):
        if type(word) == unicode: word = word.encode('utf-8')
        word = word.lower().translate(None, self.ignorechars)
        word = word.decode('utf-8')
        word = self.stemmer.stem(word)
        if word in self.dictionary: return self.dictionary.index(word)
        else:
            if add:
                self.dictionary.append(word)
                return len(self.dictionary) - 1
            else: return None

    def add_doc(self, doc):
        words = [self.dic(word, True) for word in doc.lower().split()]
        self.docss.append(words)
        for word in words:
            if word in self.stopwords:
                continue
            elif word in self.wdict:
                self.wdict[word].append(len(self.docs) - 1)
            else:
                self.wdict[word] = [len(self.docs) - 1]

    def build(self):
        self.keys = [k for k in self.wdict.keys() if len(self.wdict[k]) > 0]
        self.keys.sort()
        self.A = np.zeros([len(self.keys), len(self.docs)])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i,d] += 1

    def calc(self):
        self.U, self.S, self.Vt = svd(self.A)

    def TFIDF(self):
        wordsPerDoc = self.A.sum(axis=0)
        docsPerWord = np.asarray(self.A > 0, 'i').sum(axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            for j in range(cols):
                self.A[i,j] = (self.A[i,j] / wordsPerDoc[j]) * log(float(cols) / docsPerWord[i])

    def dump_src(self):
        self.prepare()
        print 'Здесь представлен расчет матрицы '
        for i, row in enumerate(self.A):
            print self.dictionary[i], row

    def print_svd(self):
        self.prepare()
        print 'Здесь сингулярные значения'
        print self.S
        print 'Здесь первые 3 колонки U матрица '
        for i, row in enumerate(self.U):
            print self.dictionary[self.keys[i]], row[0:3]
        print 'Здесь первые 3 строчки Vt матрица'
        print -1*self.Vt[0:3, :]

    def find(self, word):
        self.prepare()
        idx = self.dic(word)
        if not idx:
            print 'слово невстерчается'
            return []
        if not idx in self.keys:
            print 'слово отброшено как не имеющее значения которое через stopwords'
            return []
        idx = self.keys.index(idx)
        print 'word --- ', word, '=', self.dictionary[self.keys[idx]], '.\n'
        # get the coordinates of the word
        wx, wy = (-1 * self.U[:, 1:3])[idx]
        print 'word {}\t{:0.2f}\t{:0.2f}\t{}\n'.format(idx, wx, wy, word)
        arts = []
        xx, yy = -1 * self.Vt[1:3, :]
        for k, v in enumerate(self.docs):
            ax, ay = xx[k], yy[k]
            dx, dy = float(wx - ax), float(wy - ay)
            arts.append((k, v, ax, ay, sqrt(dx * dx + dy * dy)))
        return sorted(arts, key = lambda a: a[4])
Example #43
    new_d.pop(key)
    return new_d


# load pazans
pazans_groups = None

pazans_file_name = sys.argv[1]
with open(pazans_file_name, "r") as pazans_file:
    pazans_groups = json.loads(pazans_file.read())

# analyze statues
status_stats = dict()

tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
stemmer   = RussianStemmer()

users_file_name = sys.argv[2]
with open(users_file_name, "r") as users_file:
    for line in users_file:
        user = json.loads(line)
        uid = str(user["_id"])
        if uid in pazans_groups:
            pazan_groups = pazans_groups[uid]
            status_text  = user.get("status", "")
            filtered_status_text = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize(status_text)])
            if len(filtered_status_text) > 1:
                status_stats_item = status_stats.get(filtered_status_text, {
                    "full": status_text,
                    "count-boys": 0,
                    "count-girls": 0,
Example #44
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC,NuSVC,LinearSVC
from sklearn.neural_network import BernoulliRBM
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD,NMF,FactorAnalysis,PCA
from nltk.stem.snowball import RussianStemmer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from nltk import word_tokenize
from nltk.tokenize.api import StringTokenizer
from nltk.corpus import stopwords
import numpy
import pandas as pd

st = RussianStemmer()
libra = pd.read_excel('libra.xls')[['body', 'ticket_queue_id']].dropna()
libra.body = pd.Series(st.stem(x) for x in libra.body)
libra = libra.dropna()


classifier = SVC(probability=True, kernel='linear')
Example #45
from decimal import *

reload(sys)
sys.setdefaultencoding("utf-8")
from stop_words import get_stop_words

# the next line clears the file contents
open('text_after_cleaning.csv', 'w').close()
with open('text_after_cleaning.csv', 'w') as data_csv:
    fieldnames = ['post_text', 'stars']
    writer = csv.DictWriter(data_csv, fieldnames=fieldnames)
    writer.writeheader()

    with open('items.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        myPunctuation = u'–«»'
        exclude = set(string.punctuation+myPunctuation)

        for row in reader:
            text_before_cleaning = row['post_text']
            post_text = row['post_text']
            post_text = unicode(post_text, "utf-8")
            post_text = ''.join([i for i in post_text if not i.isdigit()])
            post_words = post_text.split()
            stop_words = get_stop_words('ru')
            words_after_deleting_stop_words = [w for w in post_text.split() if not w in stop_words]
            rs = RussianStemmer()
            words_after_stemming = [rs.stem(w) for w in words_after_deleting_stop_words]
            text_after_cleaning = ' '.join(words_after_stemming)
            if text_after_cleaning:
                writer.writerow({'post_text': text_after_cleaning, 'stars': row['stars']})
Example #46
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 21 18:10:41 2016

@author: asamoylov
"""

from nltk.stem.snowball import RussianStemmer

mystem = RussianStemmer()

str0 = "поздно"

print mystem.stem(str0.decode("utf-8"))


Example #47
class KareninaParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.doc_id = 0
        self.token_count = 0
        self.token_sum_len = 0      
        self.iindex = {}
        self.paragraphs = []
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = RussianStemmer()


    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
            self.doc_id += 1
        else:
           self.inside_dd = False


    def handle_data(self, data):
        if self.inside_dd:
            self.paragraphs.append(data)
            terms = set()
            for token in self.tokenizer.tokenize(unicode(data.lower(), 'utf-8')):
                if token[0] in string.punctuation:
                    continue

                self.token_count += 1
                self.token_sum_len += len(token)                   

                term = self.stemmer.stem(token)                  

                if not term in terms:
                    terms.add(term)
                    if self.iindex.has_key(term):
                        self.iindex[term].append(self.doc_id)
                    else:
                        self.iindex[term] = [ self.doc_id ]


    def dump_iindex(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.iindex, output)
        output.close()


    def dump_paragraphs(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.paragraphs, output)
        output.close()


    def get_stat(self):
        term_sum_len = 0
        for term in self.iindex.keys():
            term_sum_len += len(term)

        term_count = len(self.iindex.keys())
        
        if not (term_count and self.token_count):
            self.stat = {}

        else:
            self.stat = {
                'token_count': self.token_count,
                'token_avg_len': self.token_sum_len/float(self.token_count),
                'term_count': term_count,
                'term_avg_len': term_sum_len/float(term_count)
            }

        return self.stat


    def print_iindex(self):
        for term in sorted(self.iindex.keys()):
            posting_list = self.iindex[term]
            print term
            print len(posting_list)
            print posting_list
            print '---------------------'
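
A hypothetical driver for Example #47, assuming Python 2 (the snippet relies on print statements, unicode() and has_key()) and an HTML file whose paragraphs sit in <dd> tags; the file names are illustrative:

    parser = KareninaParser()
    with open('anna_karenina.html') as f:    # hypothetical input file
        parser.feed(f.read())
    parser.dump_iindex('iindex.pkl')
    print parser.get_stat()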
Example #48
def stemData(posts):
    global happy
    global sad
    global invert
    
    global shouldStemData
    
    statHap = {}
    statSad = {}
    statAll = {}
    
    from nltk.stem.snowball import RussianStemmer
    from nltk import word_tokenize, sent_tokenize
    from gensim.models.doc2vec import LabeledSentence
    stemmer = RussianStemmer()
    toRet = []
    curI = 0
    if shouldStemData:
        # renew smiles
        happy = stemmer.stem(happy)
        sad = stemmer.stem(sad)
    positives = []
    negatives = []
    for i in range(0, len(posts)):
        if i % 10000 == 0:
            print i
        sentences = sent_tokenize(posts[i])
        for j in range(0, len(sentences)):
            words = word_tokenize(sentences[j])
            import string
            for k in range(0, len(words)):
                try:
                    if shouldStemData and words[k] not in invert:
                        words[k] = unicode(stemmer.stem(words[k]))
                    # words[k] = cyr_to_r(words[k]).encode('utf8')
                    letters = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
                    words[k] = filter(lambda x: x in letters + string.letters + string.digits + '.!?', words[k])
                except Exception:
                    print 'failed word: ' + words[k]
                    raise Exception('')
            try:
                if words == []:
                    del sentences[j]
                    continue
                if words == [happy, '.']:
                    sentences[j] = LabeledSentence(words=words, tags=[happy])
                    if j > 0:
                        positives += [curI - 1]
                elif words == [sad, '.']:
                    sentences[j] = LabeledSentence(words=words, tags=[sad])
                    if j > 0:
                        negatives += [curI - 1]
                else:
                    for word in words:
                        if word in statAll:
                            statAll[word] += 1
                        else:
                            statAll[word] = 1
                    if happy in words:
                        positives += [curI]
                        while happy in words:
                            words.remove(happy)
                        for word in words:
                            if word in statHap:
                                statHap[word] += 1
                            else:
                                statHap[word] = 1
                    if sad in words:
                        negatives += [curI]
                        while sad in words:
                            words.remove(sad)
                        for word in words:
                            if word in statSad:
                                statSad[word] += 1
                            else:
                                statSad[word] = 1
                    sentences[j] = LabeledSentence(words=words, tags=[str(curI)])
                curI += 1
            except Exception, e:
                print words
                sentences[j] = ['']
                raise e
        toRet += sentences