def test_util_file_create_if_necessary__file_read(self):
    """file_create_if_necessary + re-reading the file: whole content,
    lowered lines (line endings kept), and stripped lines."""
    if not self._test_exec("test_file_create_if_necessary"):
        return
    Prg = self.Prg
    Path = os.path.join(Prg["DirWork"], "test_file_create_if_necessary.txt")
    util.file_del(Path)

    # mixed \n and \r\n endings on purpose, plus a trailing space
    Body = "Cloud\nRain\nSun\r\nSnow "
    WasCreated = util.file_create_if_necessary(Prg, Path, ContentDefault=Body)
    self.assertTrue(WasCreated)

    _RetRead, WholeText = util.file_read_all(Prg, Path)
    self.assertEqual(WholeText, Body)

    # Lower=True lowercases but keeps the original line endings
    self.assertEqual(["cloud\n", "rain\n", "sun\r\n", "snow "],
                     util.file_read_lines(Prg, Path, Lower=True))
    # Strip=True removes endings and surrounding whitespace
    self.assertEqual(["Cloud", "Rain", "Sun", "Snow"],
                     util.file_read_lines(Prg, Path, Strip=True))

    util.file_del(Path)
def test_util_file_read_lines(self):
    """file_read_lines with Strip=True yields the bare line contents."""
    if not self._test_exec("test_file_read_lines"):
        return
    Prg = self.Prg
    Target = os.path.join(Prg["DirWork"], "test_file_read_lines.txt")
    util.file_write(Prg, Fname=Target, Content="cat\ndog\nelephant")
    StrippedLines = util.file_read_lines(Prg, Target, Strip=True)
    self.assertEqual(StrippedLines, ["cat", "dog", "elephant"])
    util.file_del(Target)
def test_fun_pdf_to_text_converter(self):
    """The configured pdf->text converter extracts the expected first line."""
    if not self._test_exec("test_fun_pdf_to_text_converter"):
        return
    Prg = self.Prg
    TxtOut = os.path.join(Prg["DirWork"], "test_converted_from_pdf.txt")
    util.file_del(TxtOut)

    PdfIn = os.path.join(Prg["DirTestFiles"], "test_pdf_conversion.pdf")
    Prg["ConverterPdfToText"](Prg, PdfIn, TxtOut)

    Lines = util.file_read_lines(Prg, TxtOut, Strip=True)
    self.assertEqual(Lines[0], "This is new document.")
    util.file_del(TxtOut)
def document_obj_create_in_document_objects(Prg, DocumentObjects,
                                            ConvertedFileOrigNames_AbsPath,
                                            FileTextAbsPath, FileIndexAbsPath,
                                            FileSentencesAbsPath,
                                            WordPositionInLines=None):
    """Build a document_obj for FileTextAbsPath and store it in DocumentObjects.

    Registers the document's source-webpage metadata in Prg (falling back to
    the sample db, then to an "unknown" placeholder), then creates the
    document_obj keyed by the file's base name.

    Args:
        Prg: program-state dict (reads DirTextSamples, DocumentsSourceWebpages).
        DocumentObjects: dict updated in place with the new document_obj.
        ConvertedFileOrigNames_AbsPath: container of base names that were
            converted from another format (pdf/html).
        FileTextAbsPath / FileIndexAbsPath / FileSentencesAbsPath: absolute
            paths of the text, index and sentences files.
        WordPositionInLines: optional word-position index; a fresh dict is
            created per call when omitted (never a shared mutable default).
    """
    if WordPositionInLines is None:  # 'is None', and avoids a mutable default
        WordPositionInLines = dict()

    BaseNameNoExt, DotExtension = util.basename_without_extension__ext(
        FileTextAbsPath)

    if BaseNameNoExt in ConvertedFileOrigNames_AbsPath:
        # I can do it with .get() but it's more descriptive
        BaseNameNoExtOrig, DotExtensionOrig = util.basename_without_extension__ext(
            FileTextAbsPath)
        FileOrig = BaseNameNoExtOrig + DotExtensionOrig
    else:
        FileOrig = BaseNameNoExt + DotExtension

    # lazy-load the sample source-webpage db once per process
    global _DocsSampleSourceWebpages
    if not _DocsSampleSourceWebpages:
        _, _DocsSampleSourceWebpages = util_json_obj.obj_from_file(
            os.path.join(Prg["DirTextSamples"],
                         "document_samples_source_webpages.json"))

    if BaseNameNoExt not in Prg["DocumentsSourceWebpages"]:
        if BaseNameNoExt in _DocsSampleSourceWebpages["docs"]:
            DocObj = _DocsSampleSourceWebpages["docs"][BaseNameNoExt]
        else:
            # no metadata available anywhere: record an explicit placeholder
            DocObj = {
                "url": "url_unknown",
                "source_name": "source_unknown",
                "license": "unknown license"
            }
        util_json_obj.doc_source_webpages_update_in_Prg(
            Prg, BaseNameNoExt, DocObj)  # and reload the updated db

    DocumentObjects[BaseNameNoExt] = \
        document_obj(FileOrigPathAbs=FileOrig,  # if you use pdf/html, the original
                     FileTextAbsPath=FileTextAbsPath,  # and text files are different
                     FileIndex=FileIndexAbsPath,
                     FileSentences=FileSentencesAbsPath,
                     WordPositionInLines=WordPositionInLines,
                     # list of sentences
                     Sentences=util.file_read_lines(Prg, Fname=FileSentencesAbsPath)
                     if isfile(FileSentencesAbsPath) else [])
def parse_user_topic(desc, encoding='utf-8'):
    """Parse TW-LDA's TopicsDistributionOnUsers.txt for the given run *desc*.

    Each data row is tab-separated: a user filename ('<user>.txt') followed
    by per-topic probabilities.

    Args:
        desc: run description used to locate the result file.
        encoding: text encoding of the result file.

    Returns:
        list of [user(str), prob1(float), prob2(float), ...] rows.
    """
    logfilename = twlda_result_file('%s/TopicsDistributionOnUsers.txt' % desc)
    user_topic = []
    for line in file_read_lines(logfilename, encoding=encoding):
        data_line = line.strip().split('\t')
        # BUG FIX: ''.split('\t') returns [''] which is truthy, so the old
        # 'if not data_line' never skipped blank lines; test the first field.
        if not data_line[0]:
            continue
        data_line[0] = data_line[0][:-4]  # Remove '.txt'
        data_line[1:] = [float(value) for value in data_line[1:]]
        user_topic.append(data_line)
    return user_topic
def test_file_create_sentences__create_index(self):
    """Sentence splitting + index creation end to end.

    Splits a sample paragraph into sentence lines, then builds the word
    index over them and checks the encoded positions of 'london'.

    Cleanup: removes both generated files.
    (Removed unused locals MultiSub/MultiSubAndWord that were computed but
    never read, plus dead commented-out debug calls.)
    """
    self.maxDiff = None
    if not self._test_exec("test_file_create_sentences__create_index"):
        return
    Prg = self.Prg

    FileSentences = os.path.join(Prg["DirWork"], "test_file_create_sentences.txt")
    util.file_del(FileSentences)
    Sample = 'He is my friend. "This is \n the next - city, London -- here, in London, the sky is nice." Is this the third line, or a Book about London?'
    seeker.file_sentence_create(Prg, FileSentences, Sample)
    Wanted = [
        "He is my friend. \n",
        # detect London only once from this sentence:
        '"This is the next - city, London -- here, in London, the sky is nice." \n',
        "Is this the third line, or a Book about London?"
    ]
    LinesFromFile = util.file_read_lines(Prg, FileSentences)
    self.assertEqual(Wanted, LinesFromFile)

    FileIndex = os.path.join(Prg["DirWork"], "test_file_create_index.txt")
    util.file_del(FileIndex)
    seeker.file_index_create(Prg, FileIndex, FileSentences, ForcedWrite=True)

    # positions are encoded as sentence/sub-sentence/word composites
    _Status, WordPosition = util_json_obj.obj_from_file(FileIndex)
    self.assertEqual(set(WordPosition["london"]), set([10100, 10201, 20104]))

    util.file_del(FileSentences)
    util.file_del(FileIndex)
def test_results_load_from_mark_detection(Prg, FileResultPathElems):
    """Collect consecutive marked lines from a result file into blocks.

    Lines containing MarkBg or MarkFg are accumulated; any unmarked line
    closes the current block. Returns {block_id: "\\n"-joined lines}.
    """
    Collected = dict()
    BlockId = 0
    Pending = []
    ResultPath = os.path.join(Prg["DirPrgParent"], *FileResultPathElems)
    print("File result path:", ResultPath)
    for RawLine in util.file_read_lines(Prg, Fname=ResultPath):
        Stripped = RawLine.strip()
        if mark_util.MarkBg in Stripped or mark_util.MarkFg in Stripped:
            Pending.append(Stripped)
        elif Pending:
            # an unmarked line terminates the block being collected
            Collected[BlockId] = "\n".join(Pending)
            Pending = []
            BlockId += 1
    if Pending:  # flush a block that runs to end-of-file
        Collected[BlockId] = "\n".join(Pending)
    return Collected