def convert_file(dev, file, _from, _to, out):
    """Convert one declension file between the py and lua representations.

    Reads the source-side file, strips the source prologue, runs the
    configured regexp pipelines in order, and writes the result with the
    target prologue prepended.  `{dev}` placeholders in prologues and in
    the MULTILINE rules are resolved to 'dev' or 'prod'.
    """
    dev_str = 'dev' if dev else 'prod'
    paths = {
        'py': get_path(dev, 'py', file, out=out),
        'lua': get_path(dev, 'lua', file, out=out),
    }
    in_file = paths['py'] if _from == 'py' else paths['lua']
    out_file = paths['py'] if _to == 'py' else paths['lua']

    content = read(in_file)
    # Drop the source-side prologue before applying the rewrite rules.
    content = \
        content.replace(before[_from].replace('{dev}', dev_str).strip(), '')

    for rx, repl in regexps_before[_from]:
        content = re.sub(rx, repl, content, flags=re.DOTALL)
    for rx, repl in regexps_DOTALL[_from]:
        content = re.sub(rx, repl, content, flags=re.DOTALL)
    for rx, repl in regexps_MULTILINE[_from]:
        # Only the MULTILINE rules carry a {dev} placeholder.
        content = re.sub(rx.replace('{dev}', dev_str),
                         repl.replace('{dev}', dev_str),
                         content, flags=re.MULTILINE)
    # Some DOTALL rules only match after earlier rewrites; repeat the pass.
    for _ in range(5):
        for rx, repl in regexps_DOTALL[_from]:
            content = re.sub(rx, repl, content, flags=re.DOTALL)
    for rx, repl in regexps_after[_from]:
        content = re.sub(rx, repl, content, flags=re.DOTALL)

    header = before[_to].replace('{dev}', dev_str)
    write(out_file, f"{header}\n\n{content.strip()}\n".lstrip())
def generate_lists():
    """Collect Russian verbs from the storage, grouped by ending, as JSON."""
    started = datetime.now()
    for title, page in storage.iterate_pages(silent=True):
        # Skip multi-word phrases.
        if ' ' in title:
            continue
        # Only pages containing the Russian verb template.
        if '{{гл ru' not in page.ru.content:
            continue
        print(title)
        aspect = get_aspect(page)  # aspect: perfective / imperfective / unknown
        stress = get_stress(page)  # stress taken from the {{по-слогам}} template
        entry = (title, f"# [[{title}]] ({aspect}) {stress}\n")
        # Group by ending; unmatched titles fall into the '?' bucket.
        matched = [ending for ending in endings if title.endswith(ending)]
        for ending in matched:
            endings[ending].append(entry)
        if not matched:
            endings['?'].append(entry)
    print(datetime.now() - started)
    write('endings_ru_new.json', json.dumps(endings, indent=4))
def convert_file(filename, _from, _to):
    """Convert `filename` between the py and lua representations.

    Reads the `_from`-side file, strips the `_from` prologue, applies the
    configured regexp pipelines in order, and writes the result with the
    `_to` prologue prepended.
    """
    # Bug fix: `filename` was never interpolated into the paths (the
    # f-strings had no placeholder), so the parameter was silently ignored
    # and every call operated on the same hard-coded file.
    py_file = join(get_path('py'), f'{filename}.py')
    lua_file = join(get_path('lua'), f'{filename}.lua')
    in_file = py_file if _from == 'py' else lua_file
    out_file = py_file if _to == 'py' else lua_file

    content = read(in_file)
    # Drop the source-side prologue before applying the rewrite rules.
    content = content.replace(before[_from].strip(), '')

    for pattern, replace in regexps_before[_from]:
        content = re.sub(pattern, replace, content, flags=re.DOTALL)
    for pattern, replace in regexps_DOTALL[_from]:
        content = re.sub(pattern, replace, content, flags=re.DOTALL)
    for pattern, replace in regexps_MULTILINE[_from]:
        content = re.sub(pattern, replace, content, flags=re.MULTILINE)
    # Some DOTALL rules only match after earlier rewrites; repeat the pass.
    for i in range(5):
        for pattern, replace in regexps_DOTALL[_from]:
            content = re.sub(pattern, replace, content, flags=re.DOTALL)
    for pattern, replace in regexps_after[_from]:
        content = re.sub(pattern, replace, content, flags=re.DOTALL)

    content = f"{before[_to]}\n\n{content.strip()}\n".lstrip()
    write(out_file, content)
def dump_stats():
    """Download the Special:Statistics page for every configured language."""
    print(dtf('Ymd'))
    for lang in langs:
        print('Dumping language:', lang)
        response = requests.get(
            f'https://{lang}.wiktionary.org/wiki/Special:Statistics')
        write(html_path(lang), response.content.decode())
def lock(self, lock_slug):
    """Create the lock file for `lock_slug`; raise if it already exists."""
    path = self.lock_filename(lock_slug)
    if exists(path):
        message = f"Can't lock: Storage is already locked: " f'"{path}"'
        raise StorageError(message)
    # Store the acquisition timestamp inside the lock file.
    write(path, dt())
    self.locked = True
def block_path(self, title):
    """Resolve the on-disk block file for `title`, creating it if missing.

    Builds a chain of candidate paths from shallow (one directory per
    Unicode category/name of the first character) to deep (one level per
    character code, up to MAX_DEPTH), then walks it: the first candidate
    that is a plain file wins; a missing candidate is created on the spot;
    a locked candidate is retried a few times before failing.
    """
    category, name = char_info(title[0])
    # Non-letter first characters (not Ll/Lo/Lu/Pd) share one bucket.
    if category not in ['Ll', 'Lo', 'Lu', 'Pd']:
        name = 'OTHER'
    candidates = [
        (name, join(self.handler.path, name)),
    ]
    path = candidates[-1][1]
    for i in range(MAX_DEPTH):
        # Pad with code 0 when the title is shorter than MAX_DEPTH.
        code = ord(title[i]) if i < len(title) else 0
        key = f'{code} - {hex(code)}'
        path = join(path, key)
        candidates.append((title[:i + 1], path))
    for prefix, candidate in candidates:
        for attempt in range(3):
            if not exists(candidate):
                # Missing candidate: create it with default empty content.
                print(f"`write`: {candidate}")
                write(candidate, self.default_empty(prefix))
                return candidate
            if is_locked(candidate):
                # Retry up to 3 times (1s apart) before giving up.
                if attempt < 2:
                    post_to_slack(
                        'recent-errors',
                        f'Locked #{attempt}: {prefix}, {candidate}')
                    time.sleep(1)
                    continue
                raise LockedError(candidate)
            if isfile(candidate):
                return candidate
            # Candidate exists but is a directory: descend to the next
            # (deeper) candidate after the retry loop runs out.
    raise BlockNotFound(f"Path does't exist for title '{title}'"
                        )  # fixme: never should happen?
def wrapped(*args, **kwargs):
    """Run `func` under a file lock named by `slug`; skip if already locked."""
    lock_path = join(conf.root_path, 'sys', 'lock', slug)
    if exists(lock_path):
        print(dt(), f'Already locked: `{slug}`')
        return
    write(lock_path, '')
    try:
        return func(*args, **kwargs)
    finally:
        # Always release the lock, even if `func` raised.
        os.remove(lock_path)
def download_module(title, path):
    """Download wiki module `title` and store it (plus an `.out` copy)."""
    print(f'- {title}', end='')
    content = load_page(title) + '\n'
    # Uncomment the dev prefix so the local copy runs in dev mode.
    content = content.replace("\n-- dev_prefix = 'User:Vitalik/'",
                              "\ndev_prefix = 'User:Vitalik/'")
    # In debug mode only the plain copy is written, no `.out` duplicate.
    suffixes = [''] if debug else ['', '.out']
    for suffix in suffixes:
        write(path.replace('[.out]', suffix), content)
    print(' - OK')
def lock(self, lock_slug):
    """Acquire the storage lock; notify Slack and raise if already held."""
    path = self.lock_filename(lock_slug)
    if exists(path):
        # Friendly aliases for well-known lock files in the Slack message.
        aliases = {
            '/home/vitalik/storages/wiktionary/storage'
            '/authors/sys/lock_recent': '*authors*',
        }
        post_to_slack(
            'recent-errors',
            f':lock: `{Logger.slug}` '
            f'Storage is locked: {aliases.get(path, path)}')
        raise StorageAlreadyLocked(
            f'Can\'t lock: Storage is already locked: "{path}"')
    # Store the acquisition timestamp inside the lock file.
    write(path, dt())
    self.locked = True
def download_page(title, path):
    """Download wiki page `title` and save it to `path` when it changed."""
    print(f'- {title}', end='')
    try:
        content = load_page(title) + '\n'
    except NoPage:
        print(' - No page')
        return
    # Uncomment the dev prefix so the local copy runs in dev mode.
    content = content.replace("\n-- dev_prefix = 'User:Vitalik/'",
                              "\ndev_prefix = 'User:Vitalik/'")
    if exists(path):
        old_content = read(path)
        # Bug fix: the branches were swapped — the old code printed ' - OK'
        # WITHOUT saving when the content changed, and pointlessly rewrote
        # identical content while reporting ' - Not changed'.
        if old_content != content:
            write(path, content)
            print(' - OK')
        else:
            print(' - Not changed')
    else:
        write(path, content)
        print(' - NEW')
def update_langs():
    """Write per-language title lists: all pages, and non-redirects only."""
    all_articles = defaultdict(list)
    without_redirects = defaultdict(list)
    for title, page in storage.iterate_pages(silent=True):
        # NOTE(review): `page.languages.keys` has no parentheses — presumably
        # `languages` is a project object exposing `keys` as an iterable
        # property rather than a plain dict; confirm before changing.
        for lang in page.languages.keys:
            all_articles[lang].append(title)
            if title not in storage.redirects_set:
                without_redirects[lang].append(title)

    def dump(groups, *parts):
        # One file per language; empty language codes go to '-'.
        base = join(conf.PARSED_STORAGE_PATH, *parts)
        for lang, titles in groups.items():
            write(f'{base}/{lang or "-"}.txt', '\n'.join(titles))

    dump(all_articles, 'lists', 'langs')
    dump(without_redirects, 'lists', 'langs', 'articles')
    print('ok')
def convert_dir(dev, _from, _to):
    """Convert every declension file `_from` -> `_to` with a round-trip check.

    Verifies the target tree is in sync before converting, converts each
    file forward, copies the result into the `.out` tree, converts that
    copy back, and finally verifies the source tree is still in sync.
    """
    if not compare_dir(dev, _to):
        print(f'Ошибка: папки `{_to}` не синхронизированы до конвертации.')
        return
    for file in declension_files:
        # Forward conversion.
        convert_file(dev, file, _from, _to, out=False)
        # Copy the result into the `.out` tree.
        src = get_path(dev, _to, file, out=False)
        dst = get_path(dev, _to, file, out=True)
        write(dst, '')  # creates the target directories if missing
        copy(src, dst)
        # Reverse conversion of the `.out` copy.
        convert_file(dev, file, _to, _from, out=True)
    if not compare_dir(dev, _from):
        print(
            f'Ошибка: папки `{_from}` не синхронизированы после конвертации.')
        return
def save(self):
    """Back up the current file, write SEPARATOR-joined contents, delegate up."""
    backup_path = f'{self.path}.bak'
    copy(self.path, backup_path)
    joined = SEPARATOR.join(self.contents)
    write(self.path, joined)
    super(ContentsBlockHandler, self).save()
def latest_updated(self, value):
    """Persist `value` as the latest-updated timestamp (stored as UTC)."""
    stamp = dt(value, utc=True)
    write(self.latest_updated_filename, stamp)
def save(self):
    """Back up the current file, write newline-joined contents, delegate up."""
    backup_path = f'{self.path}.bak'
    copy(self.path, backup_path)
    write(self.path, '\n'.join(self.contents))
    super(SimpleBlockHandler, self).save()
def set(cls, value):
    """Persist `value` (a datetime) formatted as 'YYYY-MM-DD HH:MM:SS'."""
    formatted = value.strftime('%Y-%m-%d %H:%M:%S')
    write(cls.filename, formatted)
def sync_save(title, content):
    """Save `content` under the sync path derived from `title`."""
    write(sync_path(title), content)
def save_data(self, path, prefix, titles):
    """Write a header block plus one data block per title, SEPARATOR-joined.

    Note: sorts `titles` in place (callers may observe the new order).
    """
    titles.sort()
    header = f"Prefix: {prefix}\n" + '\n'.join(titles)
    blocks = [header]
    for title in titles:
        blocks.append(self.data(title))
    write(path, SEPARATOR.join(blocks))
def save_data(self, path, prefix, titles):
    """Write one `title<TAB>data` line per title, sorted by title.

    `prefix` is accepted for interface compatibility but unused here.
    """
    rows = []
    for title in sorted(titles):
        rows.append(f'{title}\t{self.data(title)}')
    write(path, '\n'.join(rows))
def all_pages_start_from(self, value):
    """Persist the title from which the next full page scan should resume."""
    write(self.all_pages_start_from_filename, value)
def save_articles(self, articles):
    """Persist the article titles, one per line."""
    joined = '\n'.join(articles)
    write(self.articles_filename, joined)
def save_redirects(self, redirects):
    """Persist the redirect titles, one per line."""
    joined = '\n'.join(redirects)
    write(self.redirects_filename, joined)
def save(cls):
    """Write the accumulated data to both the history and active files."""
    content = cls.sep.join(cls.data)
    for target in (cls.history_filename, cls.active_filename):
        write(target, content)
def create_fs(self):
    """Create the on-disk directory structure and record `max_count`."""
    self.create_dir(self.path, self.structure, level=1)
    max_count_path = join(self.path, '_sys', 'max_count')
    write(max_count_path, str(self.max_count))