def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) lines = get_lines(slug) for line in lines: word, desc = line.split('#') word = prettify(word) desc = prettify(desc) if not check_word(word, debug): # "-", "." and " " continue bulk.add(word, desc) bulk.process()
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) lines = get_parts(slug, '\n ') for desc in lines:#[:1000]: desc = prettify(join_lines(desc)) words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d\s?]*)\W', desc, re.UNICODE) if not words: bulk.append_desc(desc) continue word = prettify(words[0], encoding=False) if not check_word(word, debug): continue bulk.add(word, desc) bulk.process()
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) lines = get_lines(slug) for line in lines: word, desc = line.split('#', 1) # todo: не покрывает два случая: # - Господин# (ж. р. госпожа)#, владелец, ... # - Приходить# (прийти# ), прибыть, ... # todo: c запятыми можно отдельно повозиться: # - аутентичный (автентичный, отентичный) # - барон, баронет # - бросать деньги (за окно, на ветер) word = prettify(word).upper() desc = prettify(remove_begin(desc, [',', '||']).strip()) if not check_word(word, debug): # "-", "," and " " continue bulk.add(word, desc) bulk.process()
def load(chunk_len=200, debug=False): for source_slug, name_slug in slugs: print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) slug = "%s_%s" % (source_slug, name_slug) lines = get_lines(slug) for value in lines: value = prettify(value, remove_dot=False) if not check_word(value, debug): # "-" and "." continue bulk.add(value) bulk.process()
def load(chunk_len=200, debug=False): print source_slug print name_slug bulk = Bulk(source_slug, name_slug, chunk_len) parts = get_parts(slug, '\n\n') for desc in parts: desc = prettify(join_lines(desc)) first_words = re.findall(u'^([-А-ЯЁ\d][-А-ЯЁ\d]*\.?\.?\.?)', desc, re.UNICODE) if len(first_words) != 1: print u'Ошибка в первом слове:' print '#', desc word = first_words[0] words = [word] cutted = desc[len(word):] other_words = re.findall(u'\W([А-ЯЁ][-А-ЯЁ\d]+\.?\.?\.?)\W', cutted, re.UNICODE) # todo: слова с пробелами: "ВСЕ Ж ТАКИ" (2 случая) for word in other_words: length = cutted.index(word) + len(word) wrong_words = [ u'США', u'США.', u'СССР', u'ССР', u'СЯ', u'СЯ1', u'ЧК', u'СЯ2', u'ТЕ', u'СЯ1-2', u'ЭВМ', u'ВЛКСМ', u'Т-34'] # 70 получено опытным путем и годится только для этого файла if length <= 70 and word not in wrong_words: words.append(word) for i in range(len(words)): word = remove_last_dot(words[i]) if re.search('\d-\d-\d$', word): word = word[:-5] elif re.search('\d-\d$', word): word = word[:-3] elif re.search('\d$', word): word = word[:-1] elif re.search('\d-\d\.\.\.$', word): word = word[:-6] + word[-3:] elif re.search('\d\.\.\.$', word): word = word[:-4] + word[-3:] elif re.search('\d', word): print u'Ошибка! Цифра в слове: ', word words[i] = word words = set(words) # remove duplicates for word in words: if not check_word(word, debug): # "-" and "..." continue bulk.add(word, desc) bulk.process()