예제 #1
0
    def section_action(self, page, lang, section_content):
        """Record sections with semantic properties but no meaning header.

        Titles containing ':' are skipped.  A section is reported when it
        has the level-3 semantic-properties header but no level-3/4 meaning
        header.  The report bucket depends on whether the section contains
        the meaning-template line checked below.
        """
        super(WithoutMining, self).section_action(page, lang, section_content)
        title = page.title
        if ':' in title:
            return

        if not re.search(u'=== *Семантические свойства *===', section_content,
                         re.UNICODE):
            return
        if re.search(u'====? *Значение *====?', section_content, re.UNICODE):
            return

        # NOTE: a disabled "temporary fix" used to live in the fallback case:
        # it inserted the missing meaning header right after the
        # semantic-properties header and saved the page back to the wiki.
        has_template_line = u'# {{значение\n' in section_content
        key = u'Шаблон "значение"' if has_template_line else u'Возможно ошибка'
        append_dict_list(self.without_mining[key], title, lang)
예제 #2
0
파일: unknown.py 프로젝트: 2vitalik/words
 def lang_action(self, page, lang, content):
     """Collect non-whitelisted templates found in mining sections.

     Args:
         page: page object; only ``page.title`` is read here.
         lang: language code of the section, forwarded to the parent class.
         content: text of the language section to scan.

     Pages whose title starts with the template-namespace prefix are
     skipped.  For every section matched by ``P.section_mining`` each
     template matched by ``P.template_any`` is inspected; if its stripped
     name is not in the whitelist, ``(title, template_text)`` is recorded
     under that name.
     """
     super(TemplatesMining, self).lang_action(page, lang, content)
     title = page.title
     if title.startswith(u'Шаблон:'):
         return

     # The whitelist does not depend on the loop variables, so build it once
     # per call instead of once per template occurrence.
     filtered = G.mining.templates.all + G.mining.labels.all

     for section in P.section_mining.findall(content):
         for item in P.template_any.findall(section[1]):
             tpl = item[0]
             # The template name is everything before the first '|'.
             name = item[1].split('|')[0].strip()
             if name not in filtered:
                 append_dict_list(
                     self.mining_templates[u'Шаблоны в "Значение"'],
                     name, (title, tpl))
예제 #3
0
    def content_action(self, page, content, redirect, **kwargs):
        """Report morpho-like templates having a '-' in a parameter value.

        Every template matched by the pattern below is split on '|'.  For
        ``name=value`` pieces only the value part is inspected.  The first
        piece containing '-' causes the whole template text to be recorded
        for the page title, and the remaining pieces are skipped.
        """
        title = page.title
        morpho_re = re.compile(
            u"""(\{\{
                (?P<title>морфо[^|]*)  # заголовок
                (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
                \}\})""",
            flags=re.UNICODE | re.DOTALL | re.VERBOSE)
        # findall yields one 4-tuple per match:
        # (whole template, name, params, optional nested template).
        for whole, _name, params, _nested in morpho_re.findall(content):
            for chunk in params.strip().split('|'):
                chunk = chunk.strip()
                if '=' in chunk:
                    chunk = chunk.split('=', 1)[1]
                if '-' in chunk:
                    append_dict_list(self.morpho_wrong_hyphens_2, title, whole)
                    break
                    break
예제 #4
0
 def section_action(self, page, lang, section_content):
     """Report a stress mark placed on a consonant or sign letter.

     Only 'ru' sections are checked.  The first occurrence of one of the
     listed letters directly followed by a combining acute accent is
     recorded for the page title.
     """
     super(WrongStress, self).section_action(page, lang, section_content)
     title = page.title
     if lang == 'ru':
         found = re.search(u'([бвгдйклмнпрстфхцчшщьъ]́)', section_content,
                           re.UNICODE)
         if found is not None:
             append_dict_list(self.wrong_stress, title, found.group(1))
예제 #5
0
 def section_action(self, page, lang, section_content):
     """Report titles with a space that use the single-word header.

     The section must contain the level-3 header matched below, followed
     later by a line that starts with '='.
     """
     super(WrongHeaderCollocation, self).section_action(page, lang, section_content)
     title = page.title
     if ' ' in title:
         header_re = u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)='
         if re.search(header_re, section_content, re.UNICODE | re.DOTALL):
             append_dict_list(self.wrong_header_collocation, title, '')
예제 #6
0
 def section_action(self, page, lang, section_content):
     """Report titles without a space that use the collocation header.

     The section must contain the level-3 header matched below, followed
     later by a line that starts with '='.
     """
     super(WrongHeaderWord, self).section_action(page, lang, section_content)
     title = page.title
     if ' ' not in title:
         header_re = u'(=== *Тип и синтаксические свойства сочетания *=== *(.*?)\n)='
         if re.search(header_re, section_content, re.UNICODE | re.DOTALL):
             append_dict_list(self.wrong_header_word, title, '')
예제 #7
0
파일: only_one.py 프로젝트: 2vitalik/words
 def lang_action(self, page, lang, content):
     """Report language sections that contain exactly one level-2 header.

     Titles containing ':' and the main page are skipped.  The single
     header found is stored together with the language code.
     """
     super(SecondLevelOnlyOne, self).lang_action(page, lang, content)
     title = page.title
     if ":" in title or title == u"Заглавная страница":
         return
     headers = re.findall("^==[^=].+== *$", content, flags=re.MULTILINE)
     if len(headers) != 1:
         return
     append_dict_list(self.second_level_only_one, title, (lang, headers[0]))
예제 #8
0
 def header_action(self, page, before, header, after):
     super(SyntaxErrors, self).header_action(page, before, header, after)
     title = page.title
     if ':' in title:
         return
     if title == u"Заглавная страница":  # todo: move it to HeadersIterator
         return
     full_header = self.get_header(before, header, after)
     if before != after:
         append_dict_list(self.syntax_errors, title, full_header)
         if settings.ALLOW_CYR_PRINT:
             print '#syntax_errors:', title, u'—', full_header
예제 #9
0
 def section_action(self, page, lang, section_content):
     """Report meaning sections with unbalanced curly braces.

     The body is the text after the level-4 meaning header up to the next
     line starting with '='.  It is recorded when its counts of '{' and
     '}' differ.
     """
     super(MiningWrongFigureBrackets, self).section_action(page, lang, section_content)
     title = page.title
     found = re.search(u'(==== *Значение *==== *(.*?)\n)=',
                       section_content, re.UNICODE | re.DOTALL)
     if found is None:
         return
     body = found.group(2)
     if body.count('{') != body.count('}'):
         append_dict_list(self.wront_figure_brackets, title, body)
예제 #10
0
 def section_action(self, page, lang, section_content):
     """Report all-uppercase 'ru' titles lacking an abbreviation template.

     Only sections containing the morphology header (followed by another
     '=' line) are considered.  The title is recorded when the header body
     mentions neither of the two abbreviation template prefixes.
     """
     super(WithoutAbbrevRu, self).section_action(page, lang, section_content)
     title = page.title
     if lang != 'ru':
         return
     found = re.search(u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
                       section_content, re.UNICODE | re.DOTALL)
     if found is None or title != title.upper():
         return
     body = found.group(2)
     if not any(marker in body for marker in (u'{{abbrev', u'{{сокращ')):
         append_dict_list(self.without_abbrev, title, '')
예제 #11
0
 def section_action(self, page, lang, section_content):
     """Report whitespace-delimited tokens mixing Cyrillic and Latin text.

     Args:
         page: page object; only ``page.title`` is read here.
         lang: language code, forwarded to the parent class.
         section_content: text of the section to scan.

     For every fragment matched by ``P.mixed_cyrl_latn`` the surrounding
     whitespace-delimited token(s) are looked up in the section and
     recorded in a bucket chosen by the token's first character.
     Fragments containing 'j' or the superscript 'j' sign are ignored, as
     are tokens matching the ``[a-fX]СВ`` pattern.
     """
     super(ContentMixedCyrlLatn, self).section_action(page, lang, section_content)
     title = page.title
     for found in P.mixed_cyrl_latn.findall(section_content):
         # This check depends only on the fragment, so it runs before the
         # context search instead of once per context.
         if u'j' in found or u'ʲ' in found:
             continue
         # The fragment is literal page text: escape it so characters such
         # as '(', '[' or '*' are matched literally instead of being
         # interpreted (or rejected) as regex syntax.
         context_re = u'\s\S*%s\S*\s' % re.escape(found)
         for value in re.findall(context_re, section_content):
             value = value.strip()
             if re.search(u'.*[a-fX]СВ.*', value):
                 continue
             if value.startswith('|'):
                 key = u'Начинается с "верт. черта"'
             elif value.startswith('['):
                 key = u'Начинается с "квадр. скобка"'
             elif value.startswith('{'):
                 key = u'Начинается с "фигурн. скобка"'
             else:
                 key = u'Остальные результаты'
             append_dict_list(self.content_mixed_cyrl_latn[key], title, value)
예제 #12
0
    def header_action(self, page, before, header, after):
        super(WrongLevel, self).header_action(page, before, header, after)
        title = page.title
        if ":" in title:
            return
        if title == u"Заглавная страница":  # todo: move it to HeadersIterator
            return
        full_header = self.get_header(before, header, after)
        level = len(before)
        header = header.strip()
        correct_level = header_levels.get(header, 0)
        if correct_level:
            correct_levels = [correct_level]
        else:
            correct_levels = []

        has_affix = False
        for s in [u"{{suffix ", u"{{morph", u"Словообразовательная единица"]:
            if s in page.content:
                has_affix = True
                break

        if title.startswith("-") or title.endswith("-"):
            # это точно словообразовательная единица
            if header in [u"Значение", u"Антонимы", u"Синонимы"]:
                correct_levels = [3, 4]
        elif title.startswith("*"):
            # это точно праязык
            if header == u"Значение":
                correct_levels = [3, 4]
        elif has_affix:
            if header in [u"Значение", u"Антонимы", u"Синонимы"]:
                correct_levels = [3, 4]
        elif len(title) == 1:
            if header in [u"Значение"]:
                correct_levels = [3, 4]
        if correct_levels and level > 1:
            if level not in correct_levels:
                # skips = [
                #     # u'= {{-Hani-}} =', u'= {{-hani-}} =', u'= {{-hanzi-',
                #     # u'= {{-mul-}} =', u'= {{-INT-}} =', u'= {{-Zmth-}} =',
                #     # u'{{suffix ', u'{{morph', u'== Буква ==',
                #     u'Словообразовательная единица',
                # ]
                # for skip in skips:
                #     if skip in page.content:
                #         return
                append_dict_list(self.wrong_level, title, full_header)
                if settings.ALLOW_CYR_PRINT:
                    print "#wrong_level:", title, u"—", full_header
예제 #13
0
파일: other.py 프로젝트: 2vitalik/words
 def mining_line_action(self, title, lang, line):
     """Classify a mining line that is not otherwise handled.

     Lines rejected by ``ignore_start``/``ignore_italic``, lines containing
     '{' and lines starting with '|' are skipped.  Every other line is
     stored under one of four keys: file/image link, empty brackets only,
     contains square brackets, or everything else.
     """
     super(MiningOther, self).mining_line_action(title, lang, line)
     if (ignore_start(line) or ignore_italic(line)
             or "{" in line or line.startswith("|")):
         return
     is_file_link = re.match(
         u"^\[\[(Файл:|File:|Image:|Изображение:).+\]\]$", line, re.UNICODE)
     if is_file_link:
         key = u"Файл или Image"
     elif line == "[[]]":
         key = u"только пустые квадратные скобки"
     elif "[" in line:
         key = u"квадратные скобки"
     else:
         key = u"остальное"
     append_dict_list(self.mining_other, key, (title, lang, line))
예제 #14
0
 def section_action(self, page, lang, section_content):
     """Report 'ru' sections that lack the morphology header.

     Titles with a space, a leading or trailing '-' or a leading '*' are
     skipped, as are sections containing the form-template prefix and
     sections without any '===' sequence.
     """
     super(WithoutMorphoHeaderRu, self).section_action(page, lang, section_content)
     title = page.title
     if lang != 'ru':
         return
     if ' ' in title or title.startswith(('-', '*')) or title.endswith('-'):
         return
     if u'{{Форма-' in section_content or '===' not in section_content:
         return
     has_header = re.search(
         u'=== *Морфологические и синтаксические свойства *===',
         section_content, re.UNICODE | re.DOTALL)
     if has_header is None:
         append_dict_list(self.without_morpho_header, title, '')
예제 #15
0
파일: wrong.py 프로젝트: 2vitalik/words
 def header_action(self, page, before, header, after):
     super(SecondLevelWrong, self).header_action(page, before, header, after)
     title = page.title
     if ':' in title:
         return
     if title == u"Заглавная страница":
         return
     full_header = self.get_header(before, header, after)
     header = header.strip()
     if before == '==' and after == '==':
         p = re.compile(u'^\{\{(з|заголовок)[^}]*\}\}$', re.UNICODE)
         m = p.match(header)
         if not m:
             if settings.ALLOW_CYR_PRINT:
                 print '#second_level_wrong:', title, u'—', full_header
             append_dict_list(self.second_level_wrong, title, full_header)
예제 #16
0
    def section_action(self, page, lang, section_content):
        super(HeadersDuplicates, self).section_action(page, lang,
                                                      section_content)
        title = page.title
        if ':' in title:
            return

        for header, level in header_levels.items():
            found = re.findall(
                u'{} *{} *{}'.format('=' * level, header, '=' * level),
                section_content
            )
            if len(found) > 1:
                append_dict_list(self.headers_duplicates, title,
                                 (header, lang))
                if settings.ALLOW_CYR_PRINT:
                    print '#headers_duplicates:', title, u'—', header
예제 #17
0
파일: spaces.py 프로젝트: 2vitalik/words
 def lang_action(self, page, lang, content):
     """Report whitelisted templates whose name has surrounding whitespace.

     Args:
         page: page object; only ``page.title`` is read here.
         lang: language code, forwarded to the parent class.
         content: text of the language section to scan.

     For each template found inside a section matched by
     ``P.section_mining``, the raw name (text before the first '|') is
     reported when it is not whitelisted itself but its stripped form is.
     One template name is exempted below.
     """
     super(MiningTemplatesSpaces, self).lang_action(page, lang, content)
     title = page.title

     # The whitelist is loop-invariant: build it once per call rather than
     # once per template occurrence.
     filtered = G.mining.templates.sure_sure + G.mining.labels.all

     for section in P.section_mining.findall(content):
         for item in P.template_any.findall(section[1]):
             tpl = item[0]
             name = item[1].split('|')[0]
             stripped = name.strip()
             if name not in filtered and stripped in filtered \
                     and stripped != u'семантика':
                 append_dict_list(
                     self.mining_templates_spaces,
                     name, (title, tpl))
예제 #18
0
    def section_action(self, page, lang, section_content):
        """Report morpho templates having a '-' in a parameter value.

        Only the body of the morphology header is scanned (the text up to
        the next line starting with '=').  Each matched template is split
        on '|'.  For ``name=value`` pieces only the value is inspected.
        The first piece containing '-' records the whole template text.
        """
        super(MorphoWrongHyphens, self).section_action(page, lang, section_content)
        title = page.title
        header_match = re.search(
            u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
            section_content, re.UNICODE | re.DOTALL)
        if header_match is None:
            return
        morpho_re = re.compile(
            u"""(\{\{
                (?P<title>морфо\s*)  # заголовок
                \|(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
                \}\})""",
            flags=re.UNICODE | re.DOTALL | re.VERBOSE)
        # findall yields one 4-tuple per match:
        # (whole template, name, params, optional nested template).
        for whole, _name, params, _nested in morpho_re.findall(header_match.group(2)):
            for chunk in params.strip().split('|'):
                chunk = chunk.strip()
                if '=' in chunk:
                    chunk = chunk.split('=', 1)[1]
                if '-' in chunk:
                    append_dict_list(self.morpho_wrong_hyphens, title, whole)
                    break
예제 #19
0
 def content_action(self, page, content, redirect, **kwargs):
     super(OtherSyntaxErrors, self).content_action(page, content, redirect,
                                                   **kwargs)
     title = page.title
     if ':' in title:
         return
     if title == u"Заглавная страница":  # todo: move it to HeadersIterator
         return
     m = re.search('^(=+.*=+)([^=\n]+)$', content, re.MULTILINE)
     if m:
         ok = m.group(1)
         wrong = m.group(2)
         wrong = wrong.replace(' ', '&nbsp;')
         wrong = wrong.replace('\t', '&nbsp;' * 4)
         line = u"<nowiki>{}</nowiki><span style='background-color: #FFBBBB'><nowiki>{}</nowiki></span>".format(ok, wrong)
         append_dict_list(self.other_syntax_errors, title, line)
         if settings.ALLOW_CYR_PRINT:
             print '#syntax_errors:', title, u'—', line
예제 #20
0
 def section_action(self, page, lang, section_content):
     """Report 'ru' sections whose morphology block has no morpho template.

     Sections without the morphology header are skipped, as are blocks
     mentioning one of the abbreviation markers listed below.
     """
     super(WithoutMorphoRu, self).section_action(page, lang, section_content)
     title = page.title
     if lang != 'ru':
         return
     found = re.search(u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
                       section_content, re.UNICODE | re.DOTALL)
     if found is None:
         return
     body = found.group(2)
     abbrev_markers = (u'{{abbrev', u'{{сокращ', u'Аббревиатура')
     if any(marker in body for marker in abbrev_markers):
         return
     morpho_re = re.compile(
         u"""(\{\{
             (?P<title>морфо(-ru)?\s*)  # заголовок
             \|[^}]*  # параметры
             \}\})""",
         flags=re.UNICODE | re.DOTALL | re.VERBOSE)
     if morpho_re.search(body) is None:
         append_dict_list(self.without_morpho, title, '')
예제 #21
0
파일: category.py 프로젝트: 2vitalik/words
 def lang_action(self, page, lang, content):
     """Check the language parameter of every category template.

     Each template matched by ``P.template_category`` is filed under at
     most one problem key: parameter missing, not first, empty, or naming
     another language.  The last case is split by whether the page has a
     section marker for that language.
     """
     super(CategoryTemplate, self).lang_action(page, lang, content)
     title = page.title
     for item in P.template_category.findall(content):
         tpl, args = item[0], item[1]
         value = u"%s: %s" % (lang, tpl)
         if u'|язык=' not in args:
             key = u'Параметр "язык" отсутствует'
         elif not args.startswith(u'|язык='):
             key = u'Параметр "язык" не первый'
         else:
             lang_value = re.search(u'\|язык=([^|}]*)', args,
                                    re.UNICODE).group(1)
             if not lang_value:
                 key = u'Параметр "язык" пустой'
             elif lang_value == lang:
                 # Parameter is present, first and correct: nothing to report.
                 continue
             elif u'{{-%s-' % lang_value in page.content:
                 key = u'Язык не соответствует (другая секция)'
             else:
                 key = u'Язык не соответствует (секции нет)'
         append_dict_list(self.category_template[key], title, value)
예제 #22
0
파일: start.py 프로젝트: 2vitalik/words
 def mining_line_action(self, title, lang, line):
     """Classify a mining line by how it starts.

     The checks run in order and the first that holds decides the key.
     Lines matching none of them are not recorded.
     """
     super(MiningSpecificStart, self).mining_line_action(title, lang, line)
     # Ordered (predicate, key) pairs.  Order matters: the comment check
     # must come before the generic tag check.
     checks = (
         (lambda: re.match(r'^[0-9]', line), u'цифра'),
         (lambda: line.startswith(':'), u'двоеточие'),
         (lambda: line.startswith('*'), u'звёздочка'),
         (lambda: line.startswith('<!--'), u'комментарий'),
         (lambda: line.startswith('<'), u'тег'),
         (lambda: re.match(u'^[А-Я]', line, re.UNICODE), u'большая буква'),
         (lambda: re.match(r'^[a-z]', line, re.IGNORECASE), u'латиница'),
         (lambda: re.match(u'^[-«¤,.■"—]', line, re.UNICODE),
          u'необычный символ'),
     )
     for holds, key in checks:
         if holds():
             append_dict_list(self.mining_start, key, (title, lang, line))
             return
예제 #23
0
파일: length.py 프로젝트: 2vitalik/words
 def lang_action(self, page, lang, content):
     """Validate the arguments of every word-length template.

     Titles containing a space are skipped.  A template whose arguments
     do not start with ``|<digits>|<something>`` is reported as
     non-standard.  Otherwise its language argument is compared with the
     section language and its number with ``len(title)``; both mismatches
     may be reported for the same template.
     """
     super(LengthTemplate, self).lang_action(page, lang, content)
     title = page.title
     if u' ' in title:
         return
     for item in P.template_length.findall(content):
         tpl, args = item[0], item[1]
         value = u"%s: %s" % (lang, tpl)
         if re.match(u'\|\d+\|[^}|+]', args) is None:
             append_dict_list(
                 self.length_template[u'Параметры расположены нестандартно'],
                 title, value)
             continue

         parsed = re.search(u'\|(\d+)\|(lang=)?([^|}]+)', args, re.UNICODE)
         len_value, lang_value = parsed.group(1), parsed.group(3)

         if lang_value != lang:
             if u'{{-%s-' % lang_value in page.content:
                 key = u'Язык не соответствует (другая секция)'
             else:
                 key = u'Язык не соответствует (секции нет)'
             append_dict_list(self.length_template[key], title, value)

         len_title = len(title)
         if len_title != int(len_value):
             append_dict_list(
                 self.length_template[u'Длина возможно неверная'],
                 title, value + (u" -> %s" % len_title))
예제 #24
0
파일: figure.py 프로젝트: 2vitalik/words
 def mining_line_action(self, title, lang, line):
     """Classify a mining line that involves curly-brace templates.

     Lines rejected by ``ignore_start``/``ignore_italic`` are skipped.
     The rules run in order and the first that holds decides the key.  A
     rule whose key is ``None`` means the line is deliberately ignored.
     Lines matching no rule are not recorded.
     """
     super(MiningFigure, self).mining_line_action(title, lang, line)
     if ignore_start(line) or ignore_italic(line):
         return
     rules = (
         (lambda: re.match(u'^\{\{прото\|.*$', line, re.UNICODE), None),
         (lambda: re.match(u'^\{\{Нужен перевод *(\|\w+)?\}\}$$', line,
                           re.UNICODE),
          u'Нужен перевод'),
         (lambda: re.match(u'^\{\{(длина слова)\|[^}]+\}\}$', line,
                           re.UNICODE),
          u'длина слова'),
         (lambda: re.match(u'^\{\{(илл\.?)\|[^}]+\}\}$', line, re.UNICODE),
          u'шаблон илл'),
         (lambda: line.startswith((u'}} {{пример',
                                   u'{{списки семантических связей',
                                   u'|')),
          u'списки семантических связей'),
         (lambda: re.match(u'^\{\{(музы|месяцы|неделя) \w+\}\}$', line,
                           re.UNICODE),
          u'музы, месяцы, неделя'),
         (lambda: "{" in line, u'другие'),
     )
     for holds, key in rules:
         if holds():
             if key is not None:
                 append_dict_list(self.mining_figure, key,
                                  (title, lang, line))
             return
예제 #25
0
 def section_action(self, page, lang, section_content):
     """Record every old-style morpho template in a 'ru' morphology block.

     Only the body of the morphology header is scanned (the text up to
     the next line starting with '=').  The whole text of each matched
     template is stored for the page title.
     """
     super(OldMorphoRu, self).section_action(page, lang, section_content)
     title = page.title
     if lang != 'ru':
         return
     found = re.search(u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
                       section_content, re.UNICODE | re.DOTALL)
     if found is None:
         return
     morpho_re = re.compile(
         u"""(\{\{
             (?P<title>морфо\s*)  # заголовок
             \|(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
             \}\})""",
         flags=re.UNICODE | re.DOTALL | re.VERBOSE)
     # NOTE: a disabled check for templates whose parameters are all empty
     # used to live in this loop.
     for groups in morpho_re.findall(found.group(2)):
         append_dict_list(self.old_morpho, title, groups[0])
예제 #26
0
 def section_action(self, page, lang, section_content):
     """Report lone Latin letters standing next to a Cyrillic letter.

     Every fragment matched by the pattern below is located inside its
     whitespace-delimited context.  Each context is filed under a bucket
     chosen from the fragment itself.
     """
     super(ContentMixedCyrlLatnExtra, self).section_action(page, lang, section_content)
     title = page.title
     fragments = regex.findall(u'\p{IsCyrillic}\s\p{isLatin}\s|\s\p{isLatin}\s\p{IsCyrillic}', section_content, re.UNICODE)
     for found in fragments:
         contexts = re.findall(u'\s\S*%s\S*\s' % found, section_content)
         if not contexts:
             continue
         # The bucket depends only on the fragment, so pick it once.
         if check_in([u'I', u'X', u'V'], found):
             key = u'Roman'
         elif check_in([u'c', u'o'], found):
             key = u'Wrong?'
         elif re.search(u'[bdfghkmnqrstuvwzDFGLNQRSUVWYZ]', found):
             key = u'Okay?'
         else:
             key = u'Other'
         for value in contexts:
             append_dict_list(self.content_mixed_cyrl_latn_extra[key], title, value)
예제 #27
0
파일: labels.py 프로젝트: 2vitalik/words
 def lang_action(self, page, lang, content):
     """Check label templates inside mining sections.

     Only templates whose stripped name is in ``G.mining.labels.all`` are
     inspected.  A template is filed under at most one key: too many
     parameters, empty language parameter, or a language parameter that
     differs from the section language.
     """
     super(LabelsLanguageInMining, self).lang_action(page, lang, content)
     title = page.title
     labels = G.mining.labels
     for section in P.section_mining.findall(content):
         for item in P.template_any.findall(section[1]):
             tpl = item[0]
             values = item[1].split('|')
             name = values[0].strip()
             if name not in labels.all:
                 continue
             report_value = u"%s: %s" % (lang, tpl)
             count = len(values)
             # todo (from the original author): also check the
             # figurative-sense label for the intro and for the language?
             too_many = (
                 (name in labels.args_2 and count > 3
                  and name not in labels.args_3)
                 or (name in labels.args_1 and count > 2)
                 or (name in labels.args_0 and count > 1)
             )
             if too_many:
                 key = u'Лишние параметры'
             elif name not in (labels.args_1 + labels.args_2):
                 continue
             else:
                 lang_value = values[1] if count > 1 else ''
                 if not lang_value:
                     key = u'Параметр языка пустой'
                 elif lang_value != lang:
                     key = u'Язык не соответствует'
                 else:
                     continue
             append_dict_list(self.labels_language_IM[key], title,
                              report_value)
예제 #28
0
파일: labels.py 프로젝트: 2vitalik/words
 def sub_section_action(self, page, lang, sub_header, sub_section_content):
     """Check label templates in sub-sections other than the meaning one.

     Sub-sections whose header contains the meaning word are skipped.
     Only templates whose stripped name is in ``G.mining.labels.all`` are
     inspected.  A template is filed under at most one key: too many
     parameters, empty language parameter, or a language parameter other
     than '-'.
     """
     super(LabelsLanguageNotInMining, self).\
         sub_section_action(page, lang, sub_header, sub_section_content)
     title = page.title
     if u'Значение' in sub_header:
         return
     labels = G.mining.labels
     for item in P.template_any.findall(sub_section_content):
         tpl = item[0]
         values = item[1].split('|')
         name = values[0].strip()
         if name not in labels.all:
             continue
         report_value = (tpl, lang, sub_header.replace('=', '').strip())
         count = len(values)
         # todo (from the original author): also check the figurative-sense
         # label for the intro and for the language?
         too_many = (
             (name in labels.args_2 and count > 3
              and name not in labels.args_3)
             or (name in labels.args_1 and count > 2)
             or (name in labels.args_0 and count > 1)
         )
         if too_many:
             key = u'Лишние параметры'
         elif name not in (labels.args_1 + labels.args_2):
             continue
         else:
             lang_value = values[1] if count > 1 else ''
             if not lang_value:
                 key = u'Параметр языка пустой'
             elif lang_value != '-':
                 key = u'Язык не соответствует'
             else:
                 continue
         append_dict_list(self.labels_language_NIM[key], title, report_value)
예제 #29
0
파일: absent.py 프로젝트: 2vitalik/words
    def section_action(self, page, lang, section_content):
        super(NoMorphology, self).section_action(page, lang, section_content)
        title = page.title
        if title.startswith('-') or title.endswith('-'):
            return
        if title.startswith('*'):
            return

        if lang in ['INT', 'mul', 'Zmth', 'Hani', 'hani', 'hanzi']:
            return

        headers = convert_headers(self.get_headers(section_content))
        if not headers:
            return

        if ' ' not in title and u'=== Морфологические и синтаксические свойства ===' not in headers:
            append_dict_list(self.no_morphology[u'Все результаты'], title, lang)
            if section_content.strip().startswith(u'Существительное') \
                    or section_content.strip().startswith(u'Прилагательное') \
                    or section_content.strip().startswith(u'Глагол') \
                    or section_content.strip().startswith(u'Наречие'):
                append_dict_list(self.no_morphology[u'Часть речи'], title, lang)
            elif re.search('^\{\{(сущ|прил|гл|adv|падежи|нар|interj) ', section_content.strip(), re.UNICODE):
                append_dict_list(self.no_morphology[u'Шаблон часть речи'], title, lang)
            elif section_content.strip().startswith(u'{{Форма'):
                append_dict_list(self.no_morphology[u'Шаблон форма'], title, lang)
            elif section_content.strip().startswith(u'{{длина слова'):
                append_dict_list(self.no_morphology[u'Шаблон длина слова'], title, lang)
            elif section_content.strip().startswith(u'<b>'):
                append_dict_list(self.no_morphology[u'Жирность1'], title, lang)
            elif section_content.strip().startswith(u"'''"):
                append_dict_list(self.no_morphology[u'Жирность2'], title, lang)
            elif u'Тип и синтаксические свойства сочетания' in section_content:
                append_dict_list(self.no_morphology[u'Словосочетания'], title, lang)
            elif section_content.strip().startswith(u'==='):
                append_dict_list(self.no_morphology[u'Пусто'], title, lang)
            else:
                append_dict_list(self.no_morphology[u'Остальное'], title, lang)
            if settings.ALLOW_CYR_PRINT:
                print u'absent # [[{}]] (секция "{}")'.format(title, lang)
예제 #30
0
파일: order.py 프로젝트: 2vitalik/words
    def headers_action(self, page, headers, redirect):
        super(WrongStructure, self).header_action(page, page, headers,
                                                  redirect)
        title = page.title
        if ':' in title:
            return
        if title == u"Заглавная страница":  # todo: move it to HeadersIterator
            return
        if redirect:
            return

        # print '=' * 120
        # print title
        # print "\n".join(convert(headers))
        # print '-' * 120

        current_absent = dict()
        current_wrong_order = dict()
        has_unknown_header = False
        wrong_order = False
        langs, lang_sections = self.group_headers(headers)
        for lang in langs:
            section = lang_sections[lang]
            # print u"#{}".format(lang)
            converted_section = convert_headers(section)
            if lang == u'{{-ru-}}':
                if ' ' in title:
                    template = templates['ru']['phrase']
                else:
                    template = templates['ru']['word']
            else:
                if ' ' in title:
                    template = templates['xx']['phrase']
                else:
                    template = templates['xx']['word']
            if converted_section == []:
                # print 'EMPTY'
                pass
            elif converted_section == template:
                # print 'OK'
                pass
            else:
                _current_absent = list()
                for header in template:
                    if header not in converted_section:
                        # print header, ' -> ABSENT WARNING'
                        append_dict_list(current_absent, lang, header)
                        self.absent_headers.add(header)
                t = 0
                s = 0
                while t < len(template) and s < len(converted_section):
                    if template[t] == converted_section[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in _current_absent:
                            t += 1
                        else:
                            wrong_order = True
                for header in converted_section:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)
            # print '-' * 40
        # print
        if current_absent and not has_unknown_header and not wrong_order:
            self.header_absent[title] = current_absent
        if not self.i % 1000:
            print len(self.header_absent)
        if not self.i % 10000:
            for header in self.absent_headers:
                print header