def before(self):
    """Run parent setup, then load the word list and index it for fast membership tests."""
    super(A, self).before()
    self.result = ''
    # Every loaded word maps to True so later checks are dict lookups.
    self.words = dict((entry, True)
                      for entry in load_lines(filename, decode=u'utf-8'))
def __init__(self):
    """Initialise defaults and load the full word index, decoded to unicode."""
    super(A, self).__init__()
    self.message_timeout = 100000
    raw_lines = load_lines(join(settings.DATA_PATH, 'wikt_words', 'all.txt'))
    # Decode every byte-string entry up front; downstream code expects unicode.
    self.index_words = [entry.decode('utf-8') for entry in raw_lines]
def load_basewords(self):
    """Read baseforms_a.txt and return each non-empty line with its 4-char
    prefix and 2-char suffix stripped."""
    path = join(settings.ROOT_PATH, 'wikt', 'data', 'input', 'baseforms_a.txt')
    return [line[4:-2] for line in load_lines(path) if len(line)]
def load_words():
    """Create a Page stub for every word in wiktionary.txt, track the longest
    word length, and bulk-insert the pages."""
    words = load_lines("wiktionary.txt")
    pages = []
    # m tracks the maximum word length seen so far.
    m = 0
    for word in words:
        pages.append(Page(title=word))
        if len(word) > m:
            m = len(word)
    print m
    # Bulk insert in batches of 1000 — presumably a project helper; verify signature.
    Page.objects.bulk(pages, Page, 1000)
def read_titles(filename):
    """Split an index file into two lists of titles.

    Lines starting with "< " are base titles, lines starting with "> " are
    sub titles; the two-character marker is stripped from each.
    Returns (base_titles, sub_titles).
    """
    lines = load_lines(filename, "utf-8")
    base_titles = [line[2:] for line in lines if line.startswith("< ")]
    sub_titles = [line[2:] for line in lines if line.startswith("> ")]
    return base_titles, sub_titles
def load_words_from_file():
    """Load the Russian index word list, drop affixes, phrases, abbreviations
    and all-caps entries, and key each remaining word by its reversed form."""
    data = dict()
    for raw in load_lines(ru_index_filename):
        word = raw.decode('utf-8')
        # Skip prefix/suffix entries (leading or trailing '-') and phrases.
        if word[0] == '-' or word[-1] == '-' or ' ' in word:
            continue
        # Skip abbreviations (trailing '.') and acronyms (all uppercase).
        if word[-1] == '.' or word.upper() == word:
            continue
        # Reversed key enables suffix-ordered grouping later.
        data[word[::-1]] = word
    return data
def generate_red_links_index(): ignore_words_content = get_wiki_page_content(u'Участник:Vitalik/Индекс/Красные ссылки/Игнорируемые слова') ignore_words = list() for line in ignore_words_content.split('\n'): m = re.match('^\* \[\[(.*)\]\]$', line) if not m: print u'ERROR in ignore_words: %s' % line ignore_words.append(m.group(1).encode('utf8')) # print '\n'.join(ignore_words) # exit() page_names = [ u'Участник:Vitalik/Индекс/Красные ссылки/Дополнительные источники/Cinematique', u'Участник:Vitalik/Индекс/Красные ссылки/Дополнительные источники/Cinematique/Недостающие глаголы из причастий', ] cin_words = list() for page_name in page_names: cin_words_content = get_wiki_page_content(page_name) for line in cin_words_content.split('\n'): m = re.match('^[*#] \[\[(.*)\]\]$', line) if not m: print u'ERROR in cin_words: %s' % line cin_words.append(m.group(1).encode('utf8')) # print '\n'.join(cin_words) # exit() index_words = load_lines(join(settings.DATA_PATH, 'wikt_words', 'ru+redirects.txt')) dict_words = load_from_dictionaries() red_words = list((set(dict_words) | set(cin_words)) - set(index_words) - set(ignore_words)) print "Red count: ", len(red_words) # exit() # bos_words = load_from_dictionaries(['bos_barhudarov_filtering_words.txt']) # new_words = list(set(bos_words) - set(red_words) - set(index_words)) # for word in sorted(new_words): # print word # exit() # save_lines(join(settings.DATA_PATH, 'words_red_a.txt'), red_words) save_lines(join(settings.FILES_PATH, 'reports', 'red_links_ru'), sorted(red_words)) # create_index(red_words, u'Индекс/Красные ссылки (без подстраниц)', create_index(red_words, u'Индекс/Красные ссылки', desc=u'Обновление списка красных ссылок', push=True, debug=False, header=u'Красные ссылки', )
def a(self): wiktionary_nouns = {} for noun in load_lines('wiktionary_nouns.txt'): wiktionary_nouns[noun.decode('utf-8')] = 1 # print repr(noun.decode('utf-8')) i = 0 j = 0 for item in MyStemNoun.objects.all(): j += 1 value = item.value # print value # print repr(value) if value not in wiktionary_nouns: i += 1 print i, value item.not_wikt = True item.save() print i, j, len(wiktionary_nouns)
def process_names(): lines = load_lines(join(settings.DATA_PATH, 'input', 'cin_names.txt')) data = dict() for line in lines: line = line.decode('utf-8').strip() # print line name, gender = line.split(',') if name[0] != '"' or name[-1] != '"' or gender[0] != '"' or gender[-1] != '"': print line # print name, gender # print name[0], name[-1], gender[0], gender[-1] print '#' * 100 name = name[1:-1].strip() gender = gender[1:-1] if gender not in 'mf': print line print '#' * 100 data.setdefault(name, dict()) data[name][gender] = 1 if not re.match(u'^[-А-ЯЁа-яё]+$', name, re.IGNORECASE): print name print repr(name) m_list = list() f_list = list() mf_list = list() for name, genders in data.items(): m = genders.get('m') f = genders.get('f') if m and f: mf_list.append(name) elif m: m_list.append(name) elif f: f_list.append(name) else: print name save_lines(join(settings.DATA_PATH, 'input', 'cin_names_m.txt'), sorted(m_list), encode='utf-8') save_lines(join(settings.DATA_PATH, 'input', 'cin_names_f.txt'), sorted(f_list), encode='utf-8') save_lines(join(settings.DATA_PATH, 'input', 'cin_names_mf.txt'), sorted(mf_list), encode='utf-8')
def before(self):
    """Run parent setup, then load the ru+redirects word index."""
    super(FindSomething, self).before()
    index_path = join(settings.DATA_PATH, 'wikt_words', 'ru+redirects.txt')
    self.index_words = load_lines(index_path)
# print content # print words.append(page.title) return words filename = 'data/words_no_length.txt' #save_lines('data/test.txt', ['123', '456', '789']) #words = words_no_length() #save_lines(filename, words, encode='utf-8') def is_error(word): return word.find(' ') == -1 and word.find('.') == -1 def is_ok(word): return not is_error(word) words = load_lines(filename) words_no_template = filter(is_error, words) # for word in words_no_template[:100]: # print word create_index(words_no_template, u'Ошибки/длина слова/отсутствует', u'Формирование списока слов с отсутсвующим шаблоном "длина слова"', use_other=False) words_okay = filter(is_error, words) # for word in words_okay[:100]: # print word create_index(words_okay, u'Ошибки/длина слова/не нужна', u'Формирование списока слов, для которых не нужен шаблон "длина слова"', use_other=False)
# coding: utf-8 from os.path import join from django.conf import settings from dictionaries.utils.file import load_lines, save_lines from reports.red_links.create_index import \ create_index from reports.red_links.load_from_dictionaries import \ load_from_dictionaries index_words = load_lines(join(settings.DATA_PATH, 'wikt_words', 'ru+redirects.txt')) bukchina_words = load_from_dictionaries(['rus_orthography_academic.txt']) print len(bukchina_words) bukchina_red_words = list(set(bukchina_words) - set(index_words)) print len(bukchina_red_words) bukchina_red_words_a = filter(lambda x: x.decode('utf-8').lower()[0] == u'а',# or x[0] == u'А', bukchina_red_words) print len(bukchina_red_words_a) save_lines(join(settings.DATA_PATH, 'words_red_bukchina_a.txt'), bukchina_red_words_a) create_index(bukchina_red_words_a, u'Индекс/Красные ссылки/Букчина', desc=u'Создание списка красных ссылок Букчиной', max_words_on_page=100000, use_other=False, force_letters=u'А', push=True )
    # NOTE(review): this chunk opens mid-function — the enclosing definition
    # (supplying builder, use_other, debug, push, force_wiki_title,
    # wiki_prefix, path, file_path, desc) is outside this view; the exact
    # nesting of the wiki_title assignments is reconstructed — confirm.
    for letter, pages in builder.separate_pages.items():
        for page in sorted(pages, key=lambda x: len(x)):
            print page
            # Pages starting with '-' (affixes) go to the "other" bucket.
            if not use_other and page[0] == '-':
                continue
            content = builder.get(page)
            # page = ur(page)
            if debug:
                debug_write(file_path, page, content)
            if push:
                if force_wiki_title:
                    wiki_title = force_wiki_title
                else:
                    wiki_title = "%s/%s" % (wiki_prefix, path)
                wiki_title = "%s/%s" % (wiki_title, page)
                save_wiki_page(wiki_title, content, desc)
                # sleep(1)


# Build the Bulgarian index: mark each word by whether it already has a
# wiktionary category entry.
index_words = load_lines(bg_index_filename)
category_words = load_lines(bg_wikt_filename)
data = dict()
for word in set(index_words) | set(category_words):
    uword = word.decode('utf-8')
    # True when the word is already present in the wiktionary category list.
    data[uword] = word in category_words
create_index(data, u"Индекс/Болгарский язык", u"Различные доработки",
             alphabet=u"АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЭЮЯ",
             push=True)
def save_indexes():
    """Publish per-quality-level word lists split by frequency-based
    importance bands, then save a summary wiki table.

    Importance bands (frequency lists): highest = ru_new_5000; high = old
    top-10000 minus highest; medium = MAS list minus the two above; the
    remainder is "low". NOTE(review): relies on project helpers
    (get_checkers, get_reports_links, create_index, save_wiki_page) and
    levels_order defined elsewhere — semantics assumed from names.
    """
    # old_levels = [
    #     u'I',
    #     u'II++',
    #     u'II+',
    #     u'II',
    #     u'III++',
    #     u'III+',
    #     u'III',
    #     u'IV++',
    #     u'IV+',
    #     u'IV',
    # ]
    # for level in levels_order:
    #     save_wiki_page(u'Участник:Vitalik/Индекс/Качество/%s уровень' % level,
    #                    u'#redirect [[Участник:Vitalik/Индекс/Качество/%s уровень/Итого]]' % level,
    #                    u'Создание редиректа для удобного переименования')
    # for level in levels_order:
    #     save_wiki_page(u'Викисловарь:Проект:Русский язык/Качество/%s' % level,
    #                    u'#redirect [[Викисловарь:Проект:Русский язык/Качество/%s/Итого]]' % level,
    #                    u'Создание редиректа для удобного переименования')
    #     move_wiki_page(u'Викисловарь:Проект:Русский язык/Качество/%s' % level,
    #                    u'Викисловарь:Проект:Русский язык/Качество/%s/Итого' % level,
    #                    u'Создание редиректа для удобного переименования')
    # sys.exit()
    # −
    print dt(), 'started getting checkers'
    warning, errors = get_checkers()
    print dt(), 'finished getting checkers'
    warning_reports = get_reports_links(warning)
    errors_reports = get_reports_links(errors)
    # Per-level totals: counts = all words, counts1..4 = importance bands.
    counts = dict()
    counts1 = dict()
    counts2 = dict()
    counts3 = dict()
    counts4 = dict()
    highest = load_lines(settings.FILES_PATH + '/data/frequency/ru_new_5000', decode=u'utf-8')
    top1 = load_lines(settings.FILES_PATH + '/data/frequency/ru_old_1-1000', decode=u'utf-8')
    top2 = load_lines(settings.FILES_PATH + '/data/frequency/ru_old_1001-10000', decode=u'utf-8')
    medium = load_lines(settings.FILES_PATH + '/data/frequency/ru_mas_xr_0_2b', decode=u'utf-8')
    # high2 = set(top1) | (set(top2) - set(highest))
    # high1 = set(top1) | set(top2) - set(highest)
    # print len(high1), len(high2)
    # if high1 != high2:
    #     raise Exception('!!!')
    # print len(highest)
    # print len(set(top1) | set(top2))
    # "High" importance: old top-10000 words not already in the highest band.
    high = sorted(list((set(top1) | set(top2)) - set(highest)))
    # print len(high)
    # print len(medium)
    # "Medium": MAS-list words not already in the two higher bands.
    medium = sorted(list(set(medium) - set(highest) - set(high)))
    # print len(medium)
    # sys.exit(1)
    # top1_lines = load_lines(settings.FILES_PATH + 'data/frequency/ru_1-1000')
    # top2_lines = load_lines(settings.FILES_PATH + 'data/frequency/ru_1001-10000')
    # top1 = dict()
    # top2 = dict()
    # for line in top1_lines:
    #     top1[line] = True
    # for line in top2_lines:
    #     top2[line] = True
    total = 0
    total1 = 0
    total2 = 0
    total3 = 0
    total4 = 0
    path = settings.FILES_PATH + '/reports/quality/'
    for level in levels_order:
        # Level files use ASCII '-' where the level name uses minus-sign '−'.
        words = load_lines(path + 'level_%s.txt' % level.replace(u'−', u'-'), decode=u'utf-8')
        words1 = sorted(list(set(words) & set(highest)))
        words2 = sorted(list(set(words) & set(high)))
        words3 = sorted(list(set(words) & set(medium)))
        counts[level] = len(words)
        counts1[level] = len(words1)
        counts2[level] = len(words2)
        counts3[level] = len(words3)
        # "Low" importance is whatever is left over.
        counts4[level] = counts[level] - counts1[level] - counts2[level] - counts3[level]
        total += counts[level]
        total1 += counts1[level]
        total2 += counts2[level]
        total3 += counts3[level]
        total4 += counts4[level]
        #if level == u'II':
        #    continue
        # D-levels attach warning reports, E-levels attach error reports.
        if level[0] == u'D':
            words_details = warning_reports
        elif level[0] == u'E':
            words_details = errors_reports
        else:
            words_details = None
        if level[0] in [u'D', u'E']:
            content1 = '\n'.join([u"* [[%s]]\n%s\n" % (word, words_details.get(word, u"* ''(пусто)''")) for word in words1])
            content2 = '\n'.join([u"* [[%s]]\n%s\n" % (word, words_details.get(word, u"* ''(пусто)''")) for word in words2])
        else:
            content1 = '\n'.join([u"# [[%s]]" % word for word in words1])
            content2 = '\n'.join([u"# [[%s]]" % word for word in words2])
        # <includeonly> carries just the count so transclusions show totals.
        content1 = u"<noinclude>\n%s\n</noinclude><includeonly>%s</includeonly>" % \
            (content1, counts1[level])
        content2 = u"<noinclude>\n%s\n</noinclude><includeonly>%s</includeonly>" % \
            (content2, counts2[level])
        content4 = u'%s' % counts4[level]
        importance = (
            (u'Высшая', content1),
            (u'Высокая', content2),
            # (u'Средняя', content3),
            (u'Низкая', content4),
        )
        for name, content in importance:
            # print '-' * 100
            # print u'Викисловарь:Проект:Русский язык/Качество/Уровень %s/%s важность' % (level, name)
            # print '-' * 100
            # print content
            # print '-' * 100
            # print
            save_wiki_page(
                u'Викисловарь:Проект:Русский язык/Качество/Уровень %s/%s важность' % (level, name),
                content,
                u'Обновление списка слов по уровню качества и важности')
            # continue
        # Medium importance is large, so it goes through the index builder.
        create_index(
            words3,
            u'Качество/Уровень %s/%s важность' % (level, u'Средняя'),
            desc=u'Обновление списка слов по уровню качества и важности',
            push=True, debug=False,
            # push=False, debug=True,
            header=u'Уровень качества %s' % level,
            force_wiki_prefix=u'Викисловарь:Проект:Русский язык',
            # wiki_save_only_total=True,
            need_div=False,
            words_details=words_details,
        )
        create_index(
            words,
            u'Качество/Уровень %s' % level,
            desc=u'Обновление списка слов по уровню качества',
            push=True, debug=False,
            # push=False, debug=True,
            header=u'Уровень качества %s' % level,
            force_wiki_prefix=u'Викисловарь:Проект:Русский язык',
            # wiki_save_only_total=True,
            need_div=False,
            words_details=words_details,
        )
    # Summary table: quality level rows × importance columns.
    # NOTE(review): line breaks inside this literal were reconstructed from
    # wiki-table markup boundaries — confirm against the original file.
    content = u"""
{| class="wikitable" style="text-align: center"
! colspan="7" | Статьи проекта «Русский язык»
|-
! rowspan="2" | Уровень <br />качества
! colspan="6" | Важность
|-
! Высшая
! Высокая
! Средняя
! Низкая
! Всего
|-
"""
    for level in levels_order:
        content += u"| '''%s''' \n" \
                   u"| %s <!-- Уровень %s, высшая важность -->\n" \
                   u"| %s <!-- Уровень %s, высокая важность -->\n" \
                   u"| %s <!-- Уровень %s, средняя важность -->\n" \
                   u"| %s <!-- Уровень %s, низкая важность -->\n" \
                   u"| %s <!-- Уровень %s, всего -->\n" \
                   u"|- \n" % \
            (level,
             counts1[level], level,
             counts2[level], level,
             counts3[level], level,
             counts4[level], level,
             counts[level], level)
    content += u"| '''Всего''' \n" \
               u"| %s <!-- Всего, высшая важность -->\n" \
               u"| %s <!-- Всего, высокая важность -->\n" \
               u"| %s <!-- Всего, средняя важность -->\n" \
               u"| %s <!-- Всего, низкая важность-->\n" \
               u"| %s <!-- Всего -->\n" \
               u"|-\n" % \
        (total1, total2, total3, total4, total)
    content += u"|}"
    # print content
    save_wiki_page(u'Викисловарь:Проект:Русский язык/Качество/Для истории',
                   content,
                   u'Статистика по качеству статей (для истории)')
def load_lines(self):
    """Return the lines of the Muller dictionary input file.

    Delegates to the module-level load_lines helper (the method name does
    not shadow it inside the function body).
    """
    path = join(settings.DATA_PATH, 'input', 'muller_utf8.txt')
    return load_lines(path)
# NOTE(review): this chunk opens mid-expression — the call (presumably a
# join(...) building `filename`) starts outside this view.
    'bos_barhudarov_filtering_words.txt')
    # 'bos_barhudarov_filtering_names.txt')

# Earlier experiment: detect first-letter transitions between adjacent lines.
# lines = load_lines(filename, decode='utf-8')
# prev_prev = None
# prev_line = None
# for line in lines:
#     # if ',' in line:
#     #     print line
#     # print prev_line, u'—', line
#     if prev_line and line[0] != prev_line[0]:  #\
#             #and ord(line[0]) - ord(prev_line[0]) == 1:
#         # if prev_line[-1] in " -":
#         print u'→', prev_line
#         print u'→', line
#         print
#         # continue
#     prev_line = line

# Report any entry that still contains a space (multi-word phrases).
lines = load_lines(filename, decode='utf-8')
for line in lines:
    # if re.search(u'[a-z]', line):
    #     print line
    if ' ' in line:
        print line

# lines = map(lambda x: x.strip(), lines)
# lines = map(lambda x: x.replace('c', u'с').replace('p', u'р').replace('o', u'о'), lines)
# save_lines(filename, sorted(set(lines)), encode='utf-8')
# print repr(u'= {{-eo-}} =')
def load_from_dictionaries(force_files=None):
    """Aggregate headwords from the fixed dictionary files, drop known-bad
    entries, expand comma-divided entries, and normalise ellipsis / trailing
    hyphen variants.

    force_files: optional list of file names to use instead of the default set.
    Returns a list of byte-string words (order unspecified).
    """
    files = [
        u'dmitriev_academic.txt',
        u'efremova_speakrus.txt',
        u'lopatin_academic.txt',
        u'lopatin_speakrus.txt',
        u'mas_academic.txt',
        # u'rus_orthography_academic.txt',
        u'zaliznyak_speakrus.txt',
        u'ushakov.txt',
        u'bos_barhudarov_filtering_words.txt'
    ]
    if force_files:
        files = force_files
    words = []
    for filename in files:
        words += load_lines(join(settings.DATA_PATH, 'dictionaries/fixes', filename))
    # Indented lines are continuations, not headwords.
    words = filter(lambda x: not x.startswith(' '), words)
    fixes_problems = load_lines(
        join(settings.DATA_PATH, 'dictionaries', 'fixes.problems', 'comma_filtered_fixed.txt'))
    bad_words = list()
    divide_words = list()
    for line in fixes_problems:
        if line.startswith(' '):
            # ' word' marks an entry to drop entirely.
            bad_words.append(line[1:])
        elif line.startswith('/'):
            # '/word' marks an entry to split on ', '.
            divide_words.append(line[1:])
    words = list(set(words) - set(bad_words))
    # BUGFIX: iterate over a snapshot — the original appended to / removed
    # from `words` while iterating it, which silently skips elements.
    for word in list(words):
        if word in divide_words:
            for divided_item in word.split(', '):
                words.append(divided_item)
        if word.endswith('…') and ' ' not in word:
            # 'пол-…' → 'пол-'; 'что…' → 'что-'.
            if word.endswith('-…'):
                replaced_word = word.replace('…', '')
            else:
                replaced_word = word.replace('…', '-')
            words.remove(word)
            words.append(replaced_word)
        if word.endswith('...') and ' ' not in word:
            if word.endswith('-...'):
                replaced_word = word.replace('...', '')
            else:
                replaced_word = word.replace('...', '-')
            words.remove(word)
            words.append(replaced_word)
        if word.endswith(' -') and ' ' not in word[:-2]:
            replaced_word = word.replace(' -', '-')
            words.remove(word)
            words.append(replaced_word)
    words = list(set(words) - set(divide_words))
    # BUGFIX: filter instead of remove-while-iterating for the same reason.
    words = [word for word in words
             if u'- и ' not in word.decode('utf-8')
             and u'- или ' not in word.decode('utf-8')]
    return words
from dictionaries.utils.file import load_lines, save_lines lines1 = open('1b.txt').read().split('\n\n') lines2 = load_lines('2.txt') lines3 = [] i = 0 for item in lines1: i += 1 # if i > 1000: # break title = item.split('\n')[0] print title if title in lines2: lines3.append(item) print 'ok' with open('3.txt', 'w') as f: f.write('\n\n'.join(lines3))
# coding: utf-8 import os # from django.conf import settings import re from dictionaries.utils.file import load_lines FILES_PATH = '../../../files' # filename = os.path.join(settings.FILES_PATH, 'data/tihonov_verD.txt') filename = os.path.join(FILES_PATH, 'data/Lopatin2_utf8.dsl') with open(filename) as f: items = f.read().split('\n\t\n') print len(items) no_transcription = load_lines(FILES_PATH + '/reports/no_transcription_ru', decode=u'utf-8') no_transcription_dict = dict() for word in no_transcription: no_transcription_dict[word] = True output = '' # for item in items[:1000]: # for item in items[-2:]: i = 0 for item in items: i += 1 # if i > 1000: break # print item # lines = item.split('\r\n')
def process_sootv(): filename = os.path.join(settings.ROOT_PATH, 'wikt', 'data', 'sootv.txt') lines = load_lines(filename) p1 = re.compile(u"^[-а-яё́]+$", re.UNICODE | re.IGNORECASE) p2 = re.compile(u"^\[\[[- I#|а-яё́]+\]\]$", re.UNICODE | re.IGNORECASE) without = list() single = list() multi = list() other = list() prefix_case = list() for line in lines: m = re.search(u"\* '''(?P<title>[^']+)''' — (?P<sootv>.+)", line.decode('utf8'), re.UNICODE) # if not m: # print line if m: title = m.group('title') sootv = m.group('sootv') if sootv in ['-']: without.append((title, sootv)) continue if p1.match(sootv): single.append((title, sootv)) continue if p2.match(sootv): single.append((title, sootv)) continue prefixes = [u'приставочного типа ', u'приставочные вида ', u'приставочные типа ', u'приставочные: ', u'приставочный вида ', u'приставочного типа: ', u'<span>приставочные типа</span> ', u'приставочные глаголы типа ', u'приставочные вроде ', u'приставочные ',] ok = False for prefix in prefixes: if sootv.startswith(prefix): prefix_case.append((title, sootv)) ok = True break if ok: continue ok = True for item in sootv.split(', '): # if p1.match(item): # continue if not p2.match(item): ok = False if ok: multi.append((title, sootv)) else: other.append((title, sootv)) print u'\n== Приставочные глаголы типа ==' print u"{{кол|3}}" for item in prefix_case: print u"* '''[[%s]]''' — %s" % (item[0], item[1]) print u"{{конец кол}}" print u'\n== Специфические случаи ==' print u"{{кол|3}}" for item in other: link = 'https://ru.wiktionary.org/w/index.php?title=%s&action=edit§ion=2' % \ urllib.quote_plus(item[0].encode('utf8')) print u"* '''[[%s]]''' — %s" % (item[0], item[1]) #, u'\t\t\t\t\t—', link # print repr(item[1]) # print print u"{{конец кол}}" print u'\n== Соответствующего глагола нет (значение "-") ==' print u"{{кол|3}}" for item in without: print u"* '''[[%s]]''' — %s" % (item[0], u"''нет''") print u"{{конец кол}}" print u'\n== Классический вариант нескольких соответствующих 
глаголов ==' print u"{{кол|3}}" for item in multi: print u"* '''[[%s]]''' — %s" % (item[0], item[1]) print u"{{конец кол}}" print u'\n== Одиночный соответствующий глагол ==' print u"{{кол|3}}" for item in single: print u"* '''[[%s]]''' — %s" % (item[0], item[1]) print u"{{конец кол}}"
def load_dictionary_words(name, source):
    """Load the word list for dictionary `name` taken from `source`,
    stored as "<name>_<source>.txt" under DICTIONARIES_DIR."""
    return load_lines(join(DICTIONARIES_DIR, "%s_%s.txt" % (name, source)))
# coding: utf-8 from dictionaries.utils.file import load_lines from dictionaries.wiktionary_package.create_index_utils import create_index from wikt.tasks.index.get_words import sl_db_filename, sl_index_filename words = load_lines(sl_db_filename) words += load_lines(sl_index_filename) create_index(words, u'Индекс/Словенский язык', u'Формирование индекса слов для словенского языка', alphabet=u'ABCČDEFGHIJKLMNOPRSŠTUVZŽQWXY', #push=True, force_wiki_title=u"Индекс:Словенский_язык")
def load_dictionaries_words(name):
    """Load the word list stored as "<name>.txt" under DICTIONARIES_DIR."""
    return load_lines(join(DICTIONARIES_DIR, "%s.txt" % name))