def test_util_file_create_if_necessary__file_read(self):
    """Check file_create_if_necessary together with the read-all / read-lines helpers."""
    if not self._test_exec("test_file_create_if_necessary"):
        return
    Prg = self.Prg
    Fname = os.path.join(Prg["DirWork"], "test_file_create_if_necessary.txt")
    util.file_del(Fname)  # start from a clean state
    Content = "Cloud\nRain\nSun\r\nSnow "
    # the file is missing, so it must be created with the default content
    self.assertTrue(util.file_create_if_necessary(Prg, Fname, ContentDefault=Content))
    _RetRead, ContentReadAll = util.file_read_all(Prg, Fname)
    self.assertEqual(ContentReadAll, Content)
    # Lower=True: lines come back lowercased, line endings preserved
    self.assertEqual(["cloud\n", "rain\n", "sun\r\n", "snow "],
                     util.file_read_lines(Prg, Fname, Lower=True))
    # Strip=True: whitespace trimmed from both ends of every line
    self.assertEqual(["Cloud", "Rain", "Sun", "Snow"],
                     util.file_read_lines(Prg, Fname, Strip=True))
    util.file_del(Fname)
def file_sentence_create(Prg, FileSentencesAbsPath, Text="", FileTextAbsPath=""):
    """Write the filtered sentence list of a text into FileSentencesAbsPath.

    Does nothing when the sentence file already exists. The text comes either
    from the Text parameter or, when FileTextAbsPath is given, from that file
    (for testing it's easier to pass Text directly than create/del a tmpfile).
    Sentences with too many number-words and in-text duplicates are dropped.
    """
    if os.path.isfile(FileSentencesAbsPath):
        return
    if FileTextAbsPath:
        _ReadSuccess, Text = util.file_read_all(Prg, Fname=FileTextAbsPath, CheckIsFile=False)
    Kept = []
    SeenBefore = set()
    for Sentence in text.sentence_separator(Text):
        WordsHasNum, WordsWithoutNum = util.count_words_with_num(Sentence)
        # keep a sentence only when number-carrying words are a small minority
        FewNumbers = WordsWithoutNum >= WordsHasNum * 3
        # avoid duplications inside one text
        IsDuplicate = Sentence in SeenBefore
        SeenBefore.add(Sentence)
        if FewNumbers and not IsDuplicate:
            Kept.append(Sentence)
    util.file_write_utf8_error_avoid(Prg, FileSentencesAbsPath, "\n".join(Kept))
def obj_from_file(JsonFileName, UseFilePointer=False):
    """Load a JSON object from JsonFileName.

    Returns ("ok", obj) on success. On a JSON decode error the raw file
    content is printed and ("error", message) is returned.
    UseFilePointer=True reads via a plain file handle instead of the
    project's file_read_all helper.
    """
    try:
        if UseFilePointer:
            with open(JsonFileName) as Fp:
                Obj = json.load(Fp)
        else:
            _ReadStatus, FileContent = util.file_read_all(Prg={}, Fname=JsonFileName)
            Obj = json.loads(FileContent)
        return "ok", Obj
    except json.decoder.JSONDecodeError:
        # re-read raw so the broken content can be shown in the message
        FileContent = util.file_read_all_simple(JsonFileName)
        Msg = f"Json decoder error: {JsonFileName}:>>" + FileContent + "<<"
        print(Msg)
        return "error", Msg
def docs_copy_samples_into_dir(Prg, DirTarget):
    """Unpack every gzipped sample text from Prg["DirTextSamples"] into DirTarget.

    Sample texts are stored gzipped; each ``*.gz`` file is decompressed and
    saved under DirTarget with the ``.gz`` suffix removed. Files that are not
    gzipped (document info jsons, for example) are skipped. An existing file
    with the same name in DirTarget is overwritten.
    """
    util.dir_create_if_necessary(Prg, DirTarget)
    for FileName in util.files_abspath_collect_from_dir(Prg["DirTextSamples"]):
        # only the gzipped sample texts; don't duplicate other files
        if not FileName.endswith(".gz"):
            continue
        # BUG FIX: strip only the trailing ".gz" — the former
        # str.replace(".gz", "") also damaged a ".gz" occurring
        # in the middle of the base name
        BaseName = os.path.basename(FileName)[:-len(".gz")]
        # print(f"Sample doc duplication... {BaseName} {FileName}")
        _ReadSuccess, TextContent = util.file_read_all(Prg, FileName, Gzipped=True)
        FileNameSaved = os.path.join(DirTarget, BaseName)
        util.file_write_utf8_error_avoid(Prg, FileNameSaved, TextContent)
def file_index_create(Prg, FileIndexAbsPath, FileSentencesAbsPath, SubSentenceMultiplier=100, WordPositionMultiplier=100, ForcedWrite=False):
    """Build the word-position index of a sentence file and save it as JSON-like text.

    Every word maps to encoded integer positions:
    line_num * SubSentenceMultiplier * WordPositionMultiplier
    + subsentence_num * WordPositionMultiplier + word_position.
    Skips the work when FileIndexAbsPath already exists, unless ForcedWrite
    (or a test execution) forces a rebuild.
    """
    if "TestExecution" in Prg:  # in multicore indexing we can't see it in Prg but during tests I use single core
        if Prg["TestExecution"]:
            ForcedWrite = True
    if (not os.path.isfile(FileIndexAbsPath)) or ForcedWrite:
        WordPositions = dict()
        _, TextAll = util.file_read_all(Prg, FileSentencesAbsPath, CheckIsFile=False)
        TextAll = TextAll.lower()
        # more than one minus: -- or --- signs: replace them
        TextAll = text.replace_regexp(TextAll, "[-][-]+", " ")
        Lines = TextAll.split("\n")  # one sentence is in one line, it's guaranted
        SubSentenceMultiplyerMinusOne = SubSentenceMultiplier - 1
        WordPositionMultiplyerMinusOne = WordPositionMultiplier - 1
        for LineNum, Line in enumerate(Lines):
            LineNumMultiplied = LineNum * SubSentenceMultiplier * WordPositionMultiplier
            _, SubSentences = text.subsentences(Prg, Line)
            for SubSentenceNum, SubSentence in enumerate(SubSentences):
                if SubSentenceNum > SubSentenceMultiplyerMinusOne:
                    # the last num that we can represent
                    SubSentenceNum = SubSentenceMultiplyerMinusOne
                LineSubWordBase = LineNumMultiplied + SubSentenceNum * WordPositionMultiplier
                indexing(WordPositions, SubSentence, LineSubWordBase, WordPositionMultiplyerMinusOne)
        Out = []
        for Word, LineNumsInts in WordPositions.items():
            LineNums = []  # easier to debug it instead of list comprehension
            for Num in LineNumsInts:
                try:
                    LineNums.append(str(Num))
                # BUG FIX: was a bare `except:` which would also swallow
                # KeyboardInterrupt / SystemExit; narrow it to Exception
                except Exception:
                    print("index problem:", Num)
            Out.append(f'"{Word}": [{",".join(LineNums)}]')
        Content = "{\n"+"\n,".join(Out) + "\n}"
        util.file_write_with_check(Prg, Fname=FileIndexAbsPath, Content=Content)
def test_exec(Args):
    """Run the whole test suite against a simulated user document directory.

    Args is accepted for the caller's dispatch interface; it is not used here.
    The function exits the process with code 0 after the tests, so the seeker
    demo at the bottom is unreachable and kept only for manual experiments.
    """
    DirPrgExecRoot = config.get_dir_prg_exec_root()
    DirDocuments = os.path.join(DirPrgExecRoot, "test_files", "documents_user_dir_simulator")
    Prg = config.prg_config_create(TestExecution=True, PrintForDeveloper=False, DirDocuments=DirDocuments)
    print("\n" * 22)
    print("##################### TEST BEGIN #####################################")
    import test_converter
    import test_document
    import test_eng
    import test_result_selectors
    import test_seeker
    import test_seeker_logic
    import test_text
    import test_tokens
    import test_ui_html
    import test_util
    import test_util_json
    import test_util_ui
    # for token testing I need a real, huge text base.
    # to avoid plus storage, I use the text samples
    BooksForTest = [("_novels", "WilliamShakespeare__CompleteWorks__gutenberg_org_100-0")]
    for Book in BooksForTest:
        DirSample, BookBaseName = Book
        PathTest = os.path.join(Prg["DirDocuments"], BookBaseName + ".txt")
        PathSource = os.path.join(Prg["DirTextSamples"], DirSample, BookBaseName + ".txt.gz")
        if not os.path.isfile(PathTest):
            print("Doesn't EXIST:", PathTest)
            # print(PathSource)
            # BUG FIX: file_read_all needs the Prg dict as its first argument
            # (every other call site passes it); the original call omitted it
            _, Txt = util.file_read_all(Prg, Fname=PathSource, Gzipped=True)
            util.file_write_utf8_error_avoid(dict(), Fname=PathTest, Content=Txt)
        else:
            print("                EXISTs:", PathTest)
    test_util.run_all_tests(Prg)
    test_util_json.run_all_tests(Prg)
    test_document.run_all_tests(Prg)
    test_converter.run_all_tests(Prg)
    test_text.run_all_tests(Prg)
    test_seeker.run_all_tests(Prg)
    test_seeker_logic.run_all_tests(Prg)
    test_util_ui.run_all_tests(Prg)
    test_eng.run_all_tests(Prg)
    test_ui_html.run_all_tests(Prg)
    test_tokens.run_all_tests(Prg)
    test_result_selectors.run_all_tests(Prg)
    util_test.result_all(Prg)
    print("##################### TEST END #####################################")
    sys.exit(0)

    # print("\n"*22)
    ### EXTRAS AFTER BASIC TESTS ###
    # NOTE(review): unreachable while sys.exit(0) above is active —
    # kept for manual experiments with the ui search
    # execute search from ui
    seeker.be_ready_to_seeking(Prg)
    ui_console.seek_and_display(Prg, Prg["QueryExamples"]["bird_or_cat"])
def test_file_read_all(self):
    """file_read_all returns the file content unchanged (leading spaces, empty line kept)."""
    TxtRaw = " Test Line 1\n\n Test Line 3"
    Path = os.path.join(self.Prg["DirPrgParent"], "test", "test_file_read_lines.txt")
    # BUG FIX: file_read_all returns a (Success, Content) tuple at every other
    # call site; the original compared the raw string against the whole tuple,
    # which can never be equal — unpack and compare the content only
    _RetRead, ContentRead = util.file_read_all(self.Prg, Path)
    self.assertEqual(TxtRaw, ContentRead)
def test_util_file_write_append_del(self):
    """Exercise the util file write / append / gzip / checked-write / delete helpers."""
    if not self._test_exec("test_file_write_append_del"):
        return
    Prg = self.Prg

    # writing with an empty file name must fail
    self.assertFalse(util.file_write(Prg, Fname=""))

    Fname = os.path.join(Prg["DirWork"], "test_file_write.txt")
    RetWrite = util.file_write(Prg, Fname=Fname, Content="apple ")
    RetAppend = util.file_append(Prg, Fname=Fname, Content="tree")
    self.assertTrue(RetWrite)
    self.assertTrue(RetAppend)
    RetRead, ContentRead = util.file_read_all(Prg, Fname)
    self.assertTrue(RetRead)
    self.assertEqual(ContentRead, "apple tree")
    FileState, FileGzipped = util.file_is_gzipped(Prg, Fname)
    self.assertEqual("file_exists", FileState)
    self.assertEqual("not_gzipped", FileGzipped)

    # gzipped round-trip with non-ascii content
    Sample = "Árvíztűrő tükörfúrógép"
    self.assertTrue(util.file_write(Prg, Fname=Fname, Content=Sample, Gzipped=True))
    RetReadGz, ContentReadGz = util.file_read_all(Prg, Fname, Gzipped=True)
    self.assertTrue(RetReadGz)
    self.assertEqual(ContentReadGz, Sample)
    FileState, FileGzipped = util.file_is_gzipped(Prg, Fname)
    self.assertEqual("file_exists", FileState)
    self.assertEqual("gzipped", FileGzipped)

    # utf8-error-avoiding write must produce the exact encoded bytes
    util.file_write_utf8_error_avoid(Prg, Fname, Sample)
    BinFromFile = util.file_read_all_simple(Fname, "rb")
    # print("\n######### >>" + util.file_read_all(Prg, Fname)[1] + "<<")
    # print("\n######### >>", Sample.encode(), "<<")
    self.assertEqual(Sample.encode(), BinFromFile)

    # checked write: success path
    FileWriteRet = util.file_write_with_check(Prg, Fname, Sample)
    TxtFromFile = util.file_read_all_simple(Fname)
    self.assertEqual(TxtFromFile, Sample)
    self.assertTrue(FileWriteRet)

    util.file_write_with_check(Prg, Fname, "")  # clear the content of the file

    # checked write: failure path — the writer fun does nothing on purpose
    def empty_writer_fun(Prg, Fname, Sample):
        pass
    self.assertFalse(
        util.file_write_with_check(Prg, Fname, Sample, WriterFun=empty_writer_fun))

    RetDel1 = util.file_del(Fname)
    RetDel2 = util.file_del(Fname)  # second delete: the file is already gone
    self.assertTrue(RetDel1)
    self.assertFalse(RetDel2)
    FileState, FileGzipped = util.file_is_gzipped(Prg, Fname)
    self.assertEqual("file_not_found", FileState)
    self.assertEqual("", FileGzipped)