def update_dicts(self, par):
    """Populate the document mappings from every file in ``self.dir``.

    For each file, mode-0 iteration yields two dicts (id -> text and
    text -> id) merged into ``self.num_to_text`` / ``self.text_to_num``;
    a second mode-1 pass over 'art_name' fills ``self.num_to_name``.

    par: unit to iterate by, e.g. 'paragraph' (see ``coll`` for details).
    """
    for fname in os.listdir(self.dir):
        num_to_text, text_to_num = coll.iter_by_docs(fname, self.dir, par, 0)
        self.num_to_text.update(num_to_text)
        self.text_to_num.update(text_to_num)
        self.num_to_name.update(
            coll.iter_by_docs(fname, self.dir, 'art_name', 1))
def docs_parser(dir):
    # NOTE(review): this function looks truncated in this chunk — `t`,
    # `cash`, `morph` and `stop_words` are built but never used, and there
    # is no return; confirm the full body against the original file.
    # `dir` also shadows the builtin; left unchanged to keep the interface.
    cash = dict()  # presumably a memoization cache for tokenization — TODO confirm
    morph = MorphAnalyzer()
    stop_words = stopwords.words("russian")
    for file in os.listdir(dir):
        # Iterate documents of each file chapter-by-chapter (mode 1).
        for i_d in iter_by_docs(file, dir, 'chapter', 1):
            t = Tokenizer(i_d)
def all_articles_in_codexes(dir: str) -> tp.List[tp.Tuple[str, str]]:
    """Return the keys of every article across all codex files in *dir*.

    Iterates each file with ``coll.iter_by_docs(..., 'art_name2', 1)`` and
    collects the resulting dict keys into one flat list.

    Note: `dir` shadows the builtin but is kept for interface compatibility.
    """
    ans_codexes: tp.List[tp.Tuple[str, str]] = []
    # Fix: iterate os.listdir() directly instead of first copying it
    # through a no-op list comprehension.
    for codex in os.listdir(dir):
        ans_codexes.extend(
            coll.iter_by_docs(codex, dir, 'art_name2', 1).keys())
    return ans_codexes
def build_inversed_index(self, par):
    """Build the inverted index over all files in ``self.dir``.

    For every document, maps each token to a list of
    ``(doc_id, term_frequency)`` postings in ``self.inv_ind`` and records
    the document length in ``self.num_to_len``.

    par: unit to iterate by, e.g. 'paragraph'.
    """
    from collections import Counter  # local import: file's import block is not visible here

    files = os.listdir(self.dir)
    progress = tqdm(total=len(files))
    for file in files:
        for i_d in coll.iter_by_docs(file, self.dir, par, 1):
            self.tokenizer.text = i_d
            tokens = self.tokenizer.tokenize(self.cash, self.morph,
                                             self.stop_words)
            doc_num = self.text_to_num[i_d]
            # Fix: the original called tokens.count(token) for every token
            # occurrence — accidental O(n^2). Counter counts in one pass
            # and, being insertion-ordered, preserves the original
            # first-occurrence append order of postings.
            for token, freq in Counter(tokens).items():
                postings = self.inv_ind.setdefault(token, [])
                if (doc_num, freq) not in postings:
                    postings.append((doc_num, freq))
            # tokens is already a sequence (count()/len() used on it), so
            # the original len(list(tokens)) copy was unnecessary.
            self.num_to_len[doc_num] = len(tokens)
        progress.update(1)
    progress.close()
def dict_for_art_names(self):
    """Fill ``self.art_names`` with the article-name dict of every codex file."""
    codex_path = os.path.join(PATH_TO_ROOT, "codexes")
    for cod in os.listdir(codex_path):
        # Fix: mode-1 iter_by_docs calls return a single dict at every
        # other call site (it is mode 0 that returns two dicts). The old
        # `_, art_n = ...` tuple-unpacked that dict's keys, so .update()
        # would have received a key string, not the mapping.
        art_n = coll.iter_by_docs(cod, codex_path, 'art_name', 1)
        self.art_names.update(art_n)
# NOTE(review): fragment of a larger script — `co`, `j`, `ans_dict`,
# `set_numbers`, `nc` and `pic` are bound outside this chunk (an enclosing
# loop over (j, co) is implied by the uses of `j`). Indentation below is
# reconstructed from a whitespace-mangled source and must be confirmed
# against the full file.

# Normalize co.cod_norm: keep only (codex, norm) pairs present in
# set_numbers, rewriting them as (lower-cased codex name, 'ст <norm>').
new_codnorm = set()
codnorm = co.cod_norm
for cn in codnorm:
    cod = cn[0]   # presumably a collection of codex ids — TODO confirm
    norm = cn[1]  # presumably the norm/article number string — TODO confirm
    for c in cod:
        if (str(c), norm) in set_numbers:
            # norm[:-1] drops the trailing character (a separator?) — verify.
            new_codnorm.add(
                (nc.name_codexes[c].lower(), 'ст ' + norm[:-1]))
co.cod_norm = list(new_codnorm)

# Record the Q/A entry for index j.
ans_dict[j] = dict()
ans_dict[j]["Question"] = co.question
ans_dict[j]["Answer_Lawyer"] = co.answer
for i in range(len(co.cod_norm)):
    co.cod_norm[i] = ' '.join(co.cod_norm[i])
ans_dict[j]["Answer"] = ', '.join(co.cod_norm)

#json.dump(ans_dict, pic, ensure_ascii=False, indent=2)
#pic.write('\n\n')
json.dump(ans_dict, pic, indent=2)


codexes_to_json("codexes")
#norms_codexes_to_normal("codexes")
'''
coll.iter_by_docs()
for co in codexes_out:
    print(co)
print(len(codexes_out))
'''
from tools.relative_paths_to_directories import path_to_directories

# Resolve the project's directory layout relative to the working directory.
PATH_TO_ROOT, PATH_TO_TOOLS, PATH_TO_FILES, PATH_TO_TF_IDF, PATH_TO_INV_IND, PATH_TO_BM_25, \
    PATH_TO_LEARNING_TO_RANK = path_to_directories(os.getcwd())

# Directory holding the codex files.
codexes_dir = os.path.join(PATH_TO_ROOT, "codexes")

tokenizer = Tokenizer()
simple_corp = SimpleCorp()
simple_corp_art_names = SimpleCorp()

# Pass 1: collect article texts (mode 0 returns two dicts; only id->text used).
for filename in tqdm(os.listdir(codexes_dir)):
    d1, _ = coll.iter_by_docs(filename, codexes_dir, 'article', 0)
    for doc_id, doc_text in d1.items():
        simple_corp.add_doc(doc_id, doc_text)

# Pass 2: collect article names (mode 1 returns a single dict).
for filename in tqdm(os.listdir(codexes_dir)):
    names = coll.iter_by_docs(filename, codexes_dir, 'art_name', 1)
    for doc_id, doc_text in names.items():
        simple_corp_art_names.add_doc(doc_id, doc_text)

# Tokenize the article corpus and persist everything under FILES/corp.
tokenized_corp = SimpleCorp()
tokenized_corp.make_from(simple_corp, tokenizer)
simple_corp.save('codexes_corp_articles', os.path.join(PATH_TO_FILES, "corp"))
tokenized_corp.save('codexes_tokenized_corp_articles',
                    os.path.join(PATH_TO_FILES, "corp"))
# NOTE(review): this call is cut off in this chunk — its remaining
# arguments continue past the visible source.
simple_corp_art_names.save('codexes_corp_art_names',