def forwards(self, orm): "Write your forwards methods here." for song in Song.objects.all(): to_write = list() try: for i, word in enumerate(extract_words(song.lyrics)): for term in morph.normalize(word.upper()): to_write.append('1 ' + str(i) + " " + term) except TypeError: pass try: for i, word in enumerate(extract_words(song.artist)): for term in morph.normalize(word.upper()): to_write.append('2 ' + str(i) + " " + term) except TypeError: pass try: for i, word in enumerate(extract_words(song.title)): for term in morph.normalize(word.upper()): to_write.append('3 ' + str(i) + " " + term) except TypeError: pass try: for i, word in enumerate(extract_words(song.linked_movie)): for term in morph.normalize(word.upper()): to_write.append('4 ' + str(i) + " " + term) except TypeError: pass song.map_of_normalized_words = "|".join(to_write) song.save(update_fields=["map_of_normalized_words"]) print "Done", song.id
def eval_features(self, number, clazz, text):
    print text
    features = {}
    words_count = 0
    average_length = 0
    latin_words = 0
    for word in tokenizers.extract_words(text):
        words_count += 1
        average_length += len(word)
        if check_latin_word(word):
            latin_words += 1
    sentences = parser.analyze_paragraph(text)
    features['sentences_count'] = len(sentences)
    average_sentence_len = 0
    for sentence in sentences:
        average_sentence_len += len(sentence)
    # Use float division: plain ints would silently truncate under Python 2.
    features['average_sentence_length'] = float(average_sentence_len) / len(sentences)
    excl_marks = "!" in text
    features['words_count'] = words_count
    features['average_length'] = float(average_length) / words_count
    features['class'] = clazz
    features['latin_words'] = latin_words
    features['excl_marks'] = excl_marks
    features['quotes_count'] = len(self.quotes_pattern.findall(text))
    features['digits_count'] = len(self.digit_patter.findall(text))
    self.files_features[number] = features
def learn(self, class_name):
    self.classes.add(class_name)
    print class_name
    self.words_freq[class_name] = {}
    # 'is' compares identity, not string equality -- use '=='.
    if class_name == "internet":
        dir_name = learn_internet
    else:
        dir_name = learn_nointernet
    for file_name in os.listdir(dir_name):
        print "processing", file_name
        text = open(dir_name + "/" + file_name, "r").read().decode("utf-8")
        words = [word.lower() for word in tokenizers.extract_words(text)]
        self.docs_number += 1
        self.unique_words_set = self.unique_words_set | set(words)
        stemmer = RussianStemmer()
        for word in words:
            stemmed = stemmer.stem(word)
            if stemmed in self.words_freq[class_name]:
                self.words_freq[class_name][stemmed] += 1
            else:
                self.words_freq[class_name][stemmed] = 1
        if class_name in self.words_in_class:
            self.words_in_class[class_name] += len(words)
            self.docs_in_class[class_name] += 1
        else:
            self.words_in_class[class_name] = len(words)
            self.docs_in_class[class_name] = 1
def test_exctract_words(self): txt = """Это отразилось: на количественном,и на качествен_ном - росте карельско-финляндского сотрудничества - офигеть! кони+лошади=масло. -сказал кто-то --нет--""" words = list(extract_words(txt)) self.assertListEqual( words, [ "Это", "отразилось", "на", "количественном", "и", "на", "качествен_ном", "росте", "карельско-финляндского", "сотрудничества", "офигеть", "кони", "лошади", "масло", "сказал", "кто-то", "нет", ], )
def forwards(self, orm): "Write your forwards methods here." all_songs_in_db = Song.objects.all() ids_of_songs = set() for rec in all_songs_in_db: song_id = rec.aid ids_of_songs.add(int(song_id)) csvfile1 = open(FILE_WITH_SONGS_INFO, 'rb') reader1 = csv.reader(csvfile1, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) # Дополняем базу новыми записями added_songs = 0 added_terms = 0 recognized_tokens = 0 for row1 in reader1: # [self.aid, self.artist, self.title, self.duration, self.lyrics, self.url] aid, artist, title, duration, lyrics_id, url = row1[0:6] csvfile2 = open(FILE_WITH_TEXTS_OF_SONGS, 'rb') reader2 = csv.reader(csvfile2, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) for row2 in reader2: # [id_of_song, flag, text] lyrics_id2, flag, text = row2[0:3] if lyrics_id == lyrics_id2 and flag == 'russian' and int(aid) not in ids_of_songs: # Не встреченная ранее русская песня - заносим в базу s = Song(aid=int(aid), artist=artist, title=title, duration=duration, url=url, lyrics=text) s.save() # Теперь вытащим все токены из текста for word in tokenizers.extract_words(text.decode('utf8')): recognized_tokens += 1 for term in morph.normalize(word.upper()): # Берем все варианты нормализации - т.к. могут быть омонимы try: w = IndexElement.objects.get(term=term) # Слово уже было в обратном индексе except IndexElement.DoesNotExist: w = IndexElement(term=term) w.save() except IndexElement.MultipleObjectsReturned: print "WTF" return w.song.add(s) # Приписали новую ссылку на слово added_terms += 1 added_songs += 1 ids_of_songs.add(int(aid)) break csvfile2.close() csvfile1.close() print "Added songs", added_songs print "Found words", recognized_tokens print "Added terms", added_terms
def forwards(self, orm): "Write your forwards methods here." csvfile1 = open(FILE_WITH_PESNIFILM, 'rb') reader1 = csv.reader(csvfile1, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) # Дополняем базу новыми записями added_songs = 0 added_terms = 0 recognized_tokens = 0 for row1 in reader1: print added_songs # [self.artist, self.title, self.duration, self.url, lyrics, movie] # duration везде 0 artist, title, duration, url, lyrics, movie = row1[0:6] duration = int(duration) # Возможны коллизии по aid (наш диапазон [-10000 : -19999]) - заносим в базу s = Song(aid=-10000-added_songs, artist=artist, title=title, duration=duration, url=url, lyrics=lyrics, linked_movie=movie) s.save() # В индекс не пишем лишнее из авторов ar = artist.replace("исполнение", "").replace("текст", "").replace("музыка", "").replace("слова", "") ar = ar.replace(" и ", " ").replace("автор", "") all_text = lyrics + " " + title + " " + movie + " " + ar print artist # Теперь вытащим все токены из текста for word in tokenizers.extract_words(all_text.decode('utf8')): recognized_tokens += 1 for term in morph.normalize(word.upper()): # Берем все варианты нормализации - т.к. могут быть омонимы try: w = IndexElement.objects.get(term=term) # Слово уже было в обратном индексе except IndexElement.DoesNotExist: w = IndexElement(term=term) w.save() except IndexElement.MultipleObjectsReturned: print "WTF" return w.song.add(s) # Приписали новую ссылку на слово added_terms += 1 added_songs += 1 csvfile1.close() print "Added songs", added_songs print "Found words", recognized_tokens print "Added terms", added_terms
def classify(self, input):
    words = list()
    for word in tokenizers.extract_words(input):
        words.append(word)
    stemmed = [RussianStemmer().stem(word) for word in words]
    result = dict()
    for _class in self.classes:
        # Log prior: P(class) = docs_in_class / docs_number
        prob = log(float(self.docs_in_class[_class]) / self.docs_number)
        for word in stemmed:
            if word in self.words_freq[_class]:
                wordFreq = self.words_freq[_class][word]
            else:
                wordFreq = 0
            # Laplace-smoothed log likelihood
            prob += log(float(wordFreq + 1) /
                        float(len(self.unique_words_set) + self.words_in_class[_class]))
        result[_class] = prob
    return result
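# A hedged usage sketch for the Naive Bayes pair above; the classifier object
# name is an assumption. classify() returns per-class log probabilities (log
# prior plus smoothed log likelihoods), so the prediction is the argmax.
# clf.learn("internet")
# clf.learn("nointernet")
# scores = clf.classify(text)
# predicted = max(scores, key=scores.get)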
def classify(self, text, biggest_class):
    pos_count = 0
    neg_count = 0
    for word in tokenizers.extract_words(text):
        stemmed_word = RussianStemmer().stem(word)
        if stemmed_word in self.negative_keywords:
            print "negative word " + stemmed_word + " found in text: " + text
            neg_count += 1
        elif stemmed_word in self.positive_keywords:
            print "positive word " + stemmed_word + " found in text: " + text
            pos_count += 1
    result = dict()
    result['pos'] = pos_count
    result['neg'] = neg_count
    # Break ties in favour of the largest class seen so far
    if result['pos'] == result['neg']:
        result[biggest_class] += 1
    return result
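# A minimal sketch (an assumption, not code from this project) of how the
# positive/negative keyword sets consulted in classify() might be loaded from
# the normalized word lists that the normalization script further below writes:
def load_keywords(path):
    # One normalized, lower-cased word per line, UTF-8 encoded.
    with open(path, "r") as f:
        return set(line.strip().decode("utf-8") for line in f if line.strip())

# self.negative_keywords = load_keywords("negative_words_normalized.txt")  # hypothetical wiring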
def learn(self, class_name):
    dir_name = "."
    file_name = "tweets_by_trend.xml"
    self.classes.add(class_name)
    self.words_freq[class_name] = {}
    # 'is' compares identity, not string equality -- use '=='.
    if class_name == "negative":
        code = 0
    else:
        code = 1
    print "processing", file_name
    tree = ET.parse(dir_name + "/" + file_name)
    root = tree.getroot()
    for tweet in root.findall('tweet'):
        sent = int(tweet.find('sent').text)
        if sent == code:
            text = tweet.find('text').text
            words = [word.lower() for word in tokenizers.extract_words(text)]
            self.docs_number += 1
            self.unique_words_set = self.unique_words_set | set(words)
            stemmer = RussianStemmer()
            for word in words:
                stemmed = stemmer.stem(word)
                if stemmed in self.words_freq[class_name]:
                    self.words_freq[class_name][stemmed] += 1
                else:
                    self.words_freq[class_name][stemmed] = 1
            if class_name in self.words_in_class:
                self.words_in_class[class_name] += len(words)
                self.docs_in_class[class_name] += 1
            else:
                self.words_in_class[class_name] = len(words)
                self.docs_in_class[class_name] = 1
# coding=utf-8
__author__ = 'artemii'

from pymorphy import get_morph
from pymorphy.contrib import tokenizers

f = open('negative_words.txt', 'r')
resultFile = open('negative_words_normalized.txt', 'a')
morph = get_morph('.')

#normalized = morph.normalize('тнрнюооюпюрю'.decode("utf-8"))
#print normalized.pop().lower().encode("utf-8")
for line in f:
    # word = raw_input()
    words = tokenizers.extract_words(line.decode("utf-8"))
    word = words.next()
    normalized = morph.normalize(word.upper())
    resultFile.write(normalized.pop().lower().encode("utf-8") + '\n')
    # print normalized.pop().lower()

# for word pairs
#for line in f:
#    # word = raw_input()
#    words = tokenizers.extract_words(line.decode("utf-8"))
#    normalized_fst = morph.normalize(words.next().upper())
#    normalized_snd = morph.normalize(words.next().upper())
#    resultFile.write(normalized_fst.pop().lower().encode("utf-8") + ' ' + normalized_snd.pop().lower().encode("utf-8") + '\n')
def home(request):
    if request.method == 'GET' and 'search' in request.GET:
        district = ''
        regions = []
        subways = []
        films = []
        genres = []
        subjects = []
        day = datetime.now()
        string = request.GET['search']
        strings = re.split(u'[\.]+|!|\?', string)
        words = []
        for string in strings:
            for word in tokenizers.extract_words(string):
                words.append(word.upper())
        for word in words:
            word = word.upper()
            info = morph.get_graminfo(word)
            if info[0]['class'] == 'С':
                if check_object(District, {'name': word.lower()}):
                    district = word.lower()
                if check_object(Region, {'name__regex': r'(^' + info[0]['norm'].lower() + ')'}) and len(words) > words.index(word) + 1:
                    if search_by_touch(Region, info[0]['norm'], words[words.index(word) + 1]):
                        regions.append(search_by_touch(Region, info[0]['norm'], words[words.index(word) + 1]))
                elif check_object(Region, {'name__regex': r'(^' + info[0]['norm'].lower() + '$)'}):
                    regions.append(word.lower())
                if check_object(Subway, {'name__regex': r'(^' + word.lower() + ' )'}) and len(words) > words.index(word) + 1:
                    query_str = word
                    for word_ in words[words.index(word) + 1:words.index(word) + 2]:
                        if check_object(Subway, {'name__regex': r'(^' + query_str.lower() + '$)'}):
                            subways.append(Subway.objects.get(name=query_str.lower()))
                            break
                        query_str += ' ' + word_
                elif check_object(Subway, {'name__regex': r'(^' + word.lower() + '$)'}):
                    subways.append(Subway.objects.get(name=word.lower()))
                if check_object(Subject, {'name': info[0]['norm'].lower()}):
                    subjects.append(Subject.objects.get(name=info[0]['norm'].lower()))
                if check_object(Genre, {'name': info[0]['norm'].lower()}):
                    genres.append(Genre.objects.get(name=info[0]['norm'].lower()))
            elif info[0]['class'] == 'П':
                if len(words) > words.index(word) + 1:
                    if search_by_touch(Region, word, words[words.index(word) + 1]):
                        regions.append(search_by_touch(Region, word, words[words.index(word) + 1]))
                    if check_object(Subway, {'name__regex': r'(^' + word.lower() + ' )'}) and len(words) > words.index(word) + 1:
                        query_str = word
                        for word_ in words[words.index(word) + 1:words.index(word) + 3]:
                            if check_object(Subway, {'name__regex': r'(^' + query_str.lower() + '$)'}):
                                subways.append(Subway.objects.get(name=query_str.lower()))
                                break
                            query_str += ' ' + word_
                    elif check_object(Region, {'name__regex': r'(^' + info[0]['norm'].lower() + '$)'}):
                        regions.append(word.lower())
                    elif check_object(Subway, {'name__regex': r'(^' + word.lower() + '$)'}):
                        subways.append(Subway.objects.get(name=word.lower()))
        events = []
        cinemas = []
        for film in Film.objects.filter(genres__in=genres):
            films.append(film.name)
        if district != '':
            district = District.objects.get(name=district)
            for region in district.region.all():
                regions.append(region.name)
        for name in regions:
            region = Region.objects.get(name=name)
            for subway in region.subways.all():
                subways.append(subway)
        for subway in subways:
            filter_parameters = {'place__geo__x__gt': subway.geo.x - 0.005,
                                 'place__geo__x__lt': subway.geo.x + 0.005,
                                 'place__geo__y__gt': subway.geo.y - 0.005,
                                 'place__geo__y__lt': subway.geo.y + 0.005}
            output = Event.objects.filter(**filter_parameters)
            for event in output:
                events.append(event)
            if films:
                filter_parameters['films__film__name__in'] = films
                output = Cinema.objects.filter(**filter_parameters)
                for cinema in output:
                    if cinema not in cinemas:
                        cinemas.append(cinema)
        if len(cinemas) == 0:
            output = Cinema.objects.filter(films__film__name__in=films)
            for cinema in output:
                if cinema not in cinemas:
                    cinemas.append(cinema)
        if events and subjects:
            output = events
            events = []
            for i in range(len(output) - 1, -1, -1):
                for subject in output[i].subjects.all():
                    if subject in subjects:
                        events.append(output[i])
                        break
        elif subjects:
            for event in Event.objects.filter(subjects__in=subjects):
                events.append(event)
        places = []
        for i in range(len(events) - 1, -1, -1):
            if events[i].place not in places:
                places.append(events[i].place)
            else:
                events.pop(i)
        t = loader.get_template('home.html')
        c = RequestContext(request, {'events': events, 'cinemas': cinemas})
        return HttpResponse(t.render(c))
    t = loader.get_template('home.html')
    c = RequestContext(request, {'objects': {}})
    return HttpResponse(t.render(c))
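# check_object() is called throughout home() but is not defined in these
# snippets; a plausible minimal implementation, judging purely from its call
# sites (a model class plus a dict of filter kwargs, used as a boolean), is:
def check_object(model, filter_parameters):
    # True when at least one row matches the given filter kwargs.
    return model.objects.filter(**filter_parameters).exists()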
initialDict()
morph = get_morph('/home/bliq/PycharmProjects/Dictionary')  # Load the dictionaries
morth2 = pymorphy2.MorphAnalyzer()
searchWord = ""
synonym = ""
result = ""
text = raw_input("Введите текст:\n")
uni = unicode(text, "UTF-8")  # Convert the input string to Unicode
# Split the source string into substrings
listTokens = tokenizers.extract_tokens(uni)
listWords = tokenizers.extract_words(uni)
dic = {}
for word in listWords:
    info = morph.normalize(word.upper())
    info = list(info)[0]
    dic[info] = dic[info] + 1 if dic.has_key(info) else 1
# Sort by frequency, most frequent first
dic = sorted(dic.items(), key=lambda elem: elem[1], reverse=True)
for word in dic:
    info = morph.get_graminfo(word[0])
    # if info[0]['class'] != u'СОЮЗ' \
def to_seq(input):
    words = list()
    for word in tokenizers.extract_words(input):
        words.append(word)
    stemmed = [RussianStemmer().stem(word) for word in words]
    return " ".join(stemmed)
def tokenizeText(text):
    return [word for word in tokenizers.extract_words(text)]
def song_list(request):
    start_time = time.time()
    query = request.GET.get('query', '')
    if query:
        status, res = search(query.replace("L", "AND").replace(" X", " AND").replace("X", "").encode('utf-8'))
        if status == OK:
            song_list = Song.objects.filter(id__in=res)
            list_of_links = list()
            l_ew = list()
            ended_row_with_dependencies = False
            for i in extract_words(query.encode('utf-8').decode('utf-8').lstrip().rstrip()):
                if i not in ["AND", "OR", "NOT"] and "X" not in i and "L" not in i:
                    l_ew.append(i)
                elif i == "NOT":
                    ended_row_with_dependencies = True
                elif i == "AND" and not ended_row_with_dependencies:
                    list_of_links.append(0)
                elif "X" in i and not ended_row_with_dependencies:
                    list_of_links.append(len(i))
                elif "L" in i and not ended_row_with_dependencies:
                    list_of_links.append(100500)
            list_of_normalized_query_words = [0] * len(l_ew)
            for i, ew in enumerate(l_ew):
                list_of_normalized_query_words[i] = IndexElement.objects.filter(
                    term__in=morph.normalize(ew.upper())).select_related("synonyms", "song")
                if list_of_normalized_query_words[i] is None:
                    print "AAAAAAA"
            # Range here
            for s in song_list:
                l_of_clear_repeats = [0] * len(l_ew)
                l_of_normalized_repeats = [0] * len(l_ew)
                l_of_synonym_repeats = [0] * len(l_ew)
                l_of_tf_idfs = [0] * len(l_ew)
                # Build independent sets: [set()] * n would alias one shared set.
                l_of_positions = [set() for _ in l_ew]
                set_of_highlights = set()  # TODO highlight it
                # clear repeats
                low = list(extract_words(s.lyrics))
                i = 0
                for i, w in enumerate(low):
                    w = w.upper()
                    for ii, ew in enumerate(l_ew):
                        if ew.upper() == w:
                            l_of_clear_repeats[ii] += 1
                            set_of_highlights.add(i)
                            l_of_positions[ii].add(i)
                amount_of_words_in_song = i + 1
                for ii, ew in enumerate(l_ew):
                    # ((^|\|)[^|]*tr($|\|))
                    ss = list()
                    syn_list = list()
                    freq = 0
                    for nor in list_of_normalized_query_words[ii]:
                        ss.append(nor.term)
                        freq += nor.get_linked_songs_amount()
                        for syn in nor.synonyms.all():
                            syn_list.append(syn.term)
                    idf_of_ew = log((100000. - freq + 0.5) / (freq + 0.5))
                    # normalized repeats
                    pat = re.compile("((^|\|)[^\|]*(" + "|".join(ss) + ")($|\|))")
                    print "NORM", "|".join(ss)
                    includes = re.findall(pat, s.map_of_normalized_words)
                    for el in includes:
                        seg, pos, word = el[0].strip("|").split(" ")
                        if seg == '1':
                            set_of_highlights.add(int(pos))
                            l_of_positions[ii].add(int(pos))
                            l_of_normalized_repeats[ii] += 1
                    # Synonyms repeats
                    pat = re.compile("((^|\|)[^\|]*(" + "|".join(syn_list) + ")($|\|))")
                    print "SL", "|".join(syn_list)
                    includes = re.findall(pat, s.map_of_normalized_words)
                    for el in includes:
                        seg, pos, word = el[0].strip("|").split(" ")
                        if seg == '1':
                            set_of_highlights.add(int(pos))
                            l_of_positions[ii].add(int(pos))
                            l_of_synonym_repeats[ii] += 1
                    weighted_tf = (0.6) * l_of_clear_repeats[ii] + (0.3) * l_of_normalized_repeats[ii] + (0.1) * l_of_synonym_repeats[ii]
                    l_of_tf_idfs[ii] = (weighted_tf / amount_of_words_in_song) * idf_of_ew
                range_by = sum(l_of_tf_idfs)
                # TODO
                # additional ranking
                all_pos = list()
                for brakes in itertools.product(*l_of_positions):
                    flag = False
                    for i in xrange(len(brakes) - 1):
                        if brakes[i] >= brakes[i + 1]:
                            flag = True
                            break
                    if not flag:
                        all_pos.append(brakes)
                all_pos2 = list()
                # neighbors through AND
                for b in all_pos:
                    ff = True
                    for i in xrange(len(l_ew) - 1):
                        if list_of_links[i] == 0:
                            if b[i + 1] != b[i] + 1:
                                ff = False
                    if ff:
                        all_pos2.append(b)
                print "AND done", len(all_pos2)
                all_pos3 = list()
                # neighbors through XXX
                for b in all_pos2:
                    ff = True
                    for i in xrange(len(l_ew) - 1):
                        if 0 < list_of_links[i] < 100500:
                            trashline = ""
                            for ww in low[b[i] + 1: b[i + 1]]:
                                trashline += ww
                            trashline = trashline.upper()
                            # Count the vowels between the two matched positions
                            am_of_gl = trashline.count("А".decode('utf-8')) + trashline.count("Е".decode('utf-8')) + \
                                trashline.count("О".decode('utf-8')) + trashline.count("У".decode('utf-8')) + \
                                trashline.count("Ы".decode('utf-8')) + trashline.count("Э".decode('utf-8')) + \
                                trashline.count("Я".decode('utf-8')) + trashline.count("И".decode('utf-8')) + \
                                trashline.count("Ю".decode('utf-8')) + trashline.count("Ё".decode('utf-8'))
                            if am_of_gl != list_of_links[i]:
                                ff = False
                    if ff:
                        all_pos3.append(b)
                for b in all_pos3:
                    print "XXX done", b
                if len(all_pos3) > 0:
                    range_by += 1.0
                print set_of_highlights, l_of_clear_repeats, l_of_normalized_repeats, l_of_synonym_repeats, range_by
            count = song_list.count()
        else:
            return error(request, res)
    else:
        song_list = []
        count = 0
    params_dict = {
        'song_list': song_list,
        'results_count': count,
        'elapsed_time': round_time(time.time() - start_time)
    }
    return render(request, 'song_list.html', params_dict)
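# The ranking loop above computes a BM25-style inverse document frequency with a
# hard-coded corpus size of 100000. Restated standalone for clarity (freq is the
# number of songs linked to the query term):
from math import log

def idf_of_term(freq, n_docs=100000.):
    return log((n_docs - freq + 0.5) / (freq + 0.5))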
def eval_features(self, number, clazz, text):
    features = {}
    words_count = 0
    average_length = 0
    latin_words = 0
    words = list()
    if clazz in self.classes_count:
        self.classes_count[clazz] += 1
    else:
        self.classes_count[clazz] = 0
    for word in tokenizers.extract_words(text):
        words.append(word)
        words_count += 1
        average_length += len(word)
        if check_latin_word(word):
            latin_words += 1
    sentences = parser.analyze_paragraph(text)
    features['sentences_count'] = len(sentences)
    average_sentence_len = 0
    for sentence in sentences:
        average_sentence_len += len(sentence)
    # Float division: plain ints would silently truncate under Python 2.
    features['average_sentence_length'] = float(average_sentence_len) / len(sentences)
    excl_marks = "!" in text
    features['words_count'] = words_count
    features['average_length'] = float(average_length) / words_count
    features['class'] = clazz
    features['latin_words'] = latin_words
    features['excl_marks'] = excl_marks
    features['quotes_count'] = len(self.quotes_pattern.findall(text))
    features['digits_count'] = len(self.digit_patter.findall(text))
    # dict features
    features['auto_dict'] = self.count_dict_features("auto.txt", words)
    features['economics_dict'] = self.count_dict_features("economics.txt", words)
    features['hi_tech_dict'] = self.count_dict_features("hi_tech.txt", words)
    features['internet_dict'] = self.count_dict_features("internet.txt", words)
    features['kultura_dict'] = self.count_dict_features("kultura.txt", words)
    features['politics_dict'] = self.count_dict_features("politics.txt", words)
    features['science_dict'] = self.count_dict_features("science.txt", words)
    features['social_dict'] = self.count_dict_features("social.txt", words)
    features['sport_dict'] = self.count_dict_features("sport.txt", words)
    abbreviations_count = 0
    for token in text.split():
        if self.abbreviation_pattern.match(token):
            abbreviations_count += 1
            # print token
        # else:
        #     print "not match: ", token
    # features['abbrevation_count'] = abbreviations_count
    # if abbreviations_count > 30:
    #     print number, " ", abbreviations_count
    # print features
    self.files_features[number] = features
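# check_latin_word() is used by both eval_features() variants but is not defined
# in these snippets; a plausible sketch (an assumption) counts a word as Latin
# when every alphabetic character is an ASCII letter:
def check_latin_word(word):
    return all('a' <= ch.lower() <= 'z' for ch in word if ch.isalpha())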
# coding=utf-8 __author__ = "artemii" from pymorphy import get_morph from pymorphy.contrib import tokenizers f = open("negative_words.txt", "r") resultFile = open("negative_words_normalized.txt", "a") morph = get_morph(".") # normalized = morph.normalize('тнрнюооюпюрю'.decode("utf-8")) # print normalized.pop().lower().encode("utf-8") for line in f: # word = raw_input() words = tokenizers.extract_words(line.decode("utf-8")) word = words.next() normalized = morph.normalize(word.upper()) resultFile.write(normalized.pop().lower().encode("utf-8") + "\n") # print normalized.pop().lower() # for word pairs # for line in f : ## word = raw_input() # words = tokenizers.extract_words(line.decode("utf-8")) # normalized_fst = morph.normalize(words.next().upper()) # normalized_snd = morph.normalize(words.next().upper()) # resultFile.write(normalized_fst.pop().lower().encode("utf-8") + ' ' + normalized_snd.pop().lower().encode("utf-8") + '\n')