def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) parts = get_parts(slug, '\n\n') for desc in parts: desc = prettify(join_lines(desc)) first_words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d]*\.?\.?\.?)', desc, re.UNICODE) if len(first_words) != 1: print u'Ошибка в первом слове:' print '#', desc word = first_words[0] words = [word] cutted = desc[len(word):] other_words = re.findall(u'\W([А-ЯЁ][-А-ЯЁ\d]+\.?\.?\.?)\W', cutted, re.UNICODE) # todo: слова с пробелами: "ВСЕ Ж ТАКИ" (2 случая) for word in other_words: length = cutted.index(word) + len(word) wrong_words = [ u'США', u'США.', u'СССР', u'ССР', u'СЯ', u'СЯ1', u'ЧК', u'СЯ2', u'ТЕ', u'СЯ1-2', u'ЭВМ', u'ВЛКСМ', u'Т-34'] # 70 получено опытным путем и годится только для этого файла if length <= 70 and word not in wrong_words: words.append(word) for i in range(len(words)): word = remove_last_dot(words[i]) if re.search('\d-\d-\d$', word): word = word[:-5] elif re.search('\d-\d$', word): word = word[:-3] elif re.search('\d$', word): word = word[:-1] elif re.search('\d-\d\.\.\.$', word): word = word[:-6] + word[-3:] elif re.search('\d\.\.\.$', word): word = word[:-4] + word[-3:] elif re.search('\d', word): print u'Ошибка! Цифра в слове: ', word words[i] = word words = set(words) # remove duplicates for word in words: if not check_word(word, debug): # "-" and "..." continue bulk.add(word, desc) bulk.process()
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) lines = get_parts(slug, '\n ') for desc in lines:#[:1000]: desc = prettify(join_lines(desc)) words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d\s?]*)\W', desc, re.UNICODE) if not words: bulk.append_desc(desc) continue word = prettify(words[0], encoding=False) if not check_word(word, debug): continue bulk.add(word, desc) bulk.process()