def test_merging(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test3/'
    filename = 'test3'
    pathlist = Path("./tests/data/test3/").glob('**/la*')
    filemanager = fm.FileManager(filename, currentWorkspace)
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
        filemanager.save_vocabularyAndPL_file(voc, True)
        voc = dict()
    filemanager.mergePartialVocsAndPL(False)
    # TODO: change this once a direct function is available
    savedVoc = filemanager.read_vocabulary()
    mot, sortedByScore = query.get_posting_list(
        savedVoc, "aa", filemanager, True)
    self.assertEqual(mot, {1: [0, 3], 2: [0, 2], 3: [0, 1],
                           4: [0, 3], 5: [0, 2], 6: [0, 1]})
    # The score is equal to zero
    self.assertEqual(sortedByScore,
                     [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)])
    mot, sortedByScore = query.get_posting_list(
        savedVoc, "bb", filemanager, True)
    # The score is equal to zero
    self.assertEqual(sortedByScore, [(0, 1), (0, 2), (0, 4), (0, 5)])
    self.assertEqual(mot, {1: [0, 1], 2: [0, 1], 4: [0, 1], 5: [0, 1]})
    mot, sortedByScore = query.get_posting_list(
        savedVoc, "cc", filemanager, True)
    self.assertEqual(mot, {3: [0, 1], 6: [0, 1]})
    self.assertEqual(sortedByScore, [(0, 3), (0, 6)])
def test_topk_trivial_file(self):
    pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')
    filemana = filemanager.FileManager(
        "TestFaginsTopK", "./tests/workspace/testsfaginstopk/")
    tempVoc = SortedDict()
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
    filemana.save_vocabularyAndPL_file(tempVoc)
    # Extraction of the saved Voc
    savedVoc = filemana.read_vocabulary()
    topk = faginstatopk.apply_fagins_ta(['aa', 'bb'], savedVoc, filemana, 0, 5)
    # If conjunctive:
    # self.checkResultApproximative(topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2)])
    self.checkResultApproximative(
        topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2),
               (1, math.log(3 / 4) / 2), (3, math.log(3 / 4) / 2)])
    topk = faginstatopk.apply_fagins_ta(['bb'], savedVoc, filemana, 0, 5)
    self.checkResultApproximative(topk, [(2, math.log(3 / 2))])
    topk = faginstatopk.apply_fagins_ta(['cc'], savedVoc, filemana, 0, 5)
    self.checkResultApproximative(topk, [])
    topk = faginstatopk.apply_fagins_ta(['cc', 'dd'], savedVoc, filemana, 0, 5)
    self.checkResultApproximative(topk, [])
def test_merging_3_files_scores(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test4/'
    filename = 'test4merging3filesscores'
    pathlist = Path("./tests/data/test4/").glob('**/la*')
    filemanager = fm.FileManager(filename, currentWorkspace)
    for path in pathlist:
        analysis.analyse_newspaper(path, voc, computeIDF=True)
        filemanager.save_vocabularyAndPL_file(voc, True)
        voc = dict()
    filemanager.mergePartialVocsAndPL(True)
    savedVoc = filemanager.read_vocabulary()
    mot, sortedByScore = query.get_posting_list(savedVoc, "aa", filemanager, True)
    self.assertEqual(mot, {1: [0.24718092381954193, 3.0],
                           2: [0.32882189750671387, 6.0],
                           5: [0.11778303235769272, 1.0],
                           6: [0.11778303235769272, 1.0],
                           20: [0.24718092381954193, 3.0],
                           21: [0.19942401349544525, 2.0],
                           22: [0.11778303235769272, 1.0]})
    self.assertEqual(sortedByScore, [(0.32882189750671387, 2),
                                     (0.24718092381954193, 1),
                                     (0.24718092381954193, 20),
                                     (0.19942401349544525, 21),
                                     (0.11778303235769272, 5),
                                     (0.11778303235769272, 6),
                                     (0.11778303235769272, 22)])
    mot, sortedByScore = query.get_posting_list(savedVoc, "bb", filemanager, True)
    self.assertEqual(mot, {1: [0.5274115204811096, 3.0],
                           2: [0.7016094326972961, 6.0],
                           4: [0.2513144314289093, 1.0],
                           5: [0.2513144314289093, 1.0],
                           20: [0.2513144314289093, 1.0],
                           21: [0.2513144314289093, 1.0]})
    self.assertEqual(sortedByScore, [(0.7016094326972961, 2),
                                     (0.5274115204811096, 1),
                                     (0.2513144314289093, 4),
                                     (0.2513144314289093, 5),
                                     (0.2513144314289093, 20),
                                     (0.2513144314289093, 21)])
def test_read_postingList(self):
    currentWorkspace = './tests/workspace/testfilemanager2/'
    filename = 'testfm2'
    postingList = dict()
    postingList[1] = [0, 101]
    postingList[2] = [0, 30023]
    postingList[294] = [0, 159]
    postingList[23445] = [0, 3006]
    filemanager = fm.FileManager(filename, currentWorkspace)
    filemanager.save_postList(postingList, 0)
    pl = filemanager.read_postList(0, 4)
    self.assertEqual(pl, {1: [0, 101], 2: [0, 30023], 294: [0, 159],
                          23445: [0, 3006]},
                     "The sorted Dict should be the same")
def test_merging_3_files(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test4/'
    filename = 'test4merging3files'
    pathlist = Path("./tests/data/test4/").glob('**/la*')
    filemanager = fm.FileManager(filename, currentWorkspace)
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
        filemanager.save_vocabularyAndPL_file(voc, True)
        voc = dict()
    filemanager.mergePartialVocsAndPL(False)
    # TODO: change this once a direct function is available
    savedVoc = filemanager.read_vocabulary()
    mot = query.get_posting_list(savedVoc, "aa", filemanager)
    self.assertEqual(mot, {1: [0, 3.0], 2: [0, 6.0], 5: [0, 1.0],
                           6: [0, 1.0], 20: [0, 3.0], 21: [0, 2.0],
                           22: [0, 1.0]})
    mot = query.get_posting_list(savedVoc, "bb", filemanager)
    self.assertEqual(mot, {1: [0, 3], 2: [0, 6], 20: [0, 1],
                           21: [0, 1], 4: [0, 1], 5: [0, 1]})
    mot = query.get_posting_list(savedVoc, "cc", filemanager)
    self.assertEqual(mot, {1: [0, 1], 2: [0, 3], 22: [0, 1],
                           4: [0, 1], 6: [0, 1]})
    mot = query.get_posting_list(savedVoc, "dd", filemanager)
    self.assertEqual(mot, {1: [0, 2], 2: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ff", filemanager)
    self.assertEqual(mot, {1: [0, 1], 20: [0, 1], 6: [0, 1]}, "FF")
    mot = query.get_posting_list(savedVoc, "qq", filemanager)
    self.assertEqual(mot, {1: [0, 1], 5: [0, 1]})
    mot = query.get_posting_list(savedVoc, "rr", filemanager)
    self.assertEqual(mot, {1: [0, 5], 21: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ee", filemanager)
    self.assertEqual(mot, {1: [0, 1], 23: [0, 1]})
    mot = query.get_posting_list(savedVoc, "vv", filemanager)
    self.assertEqual(mot, {1: [0, 1]})
    mot = query.get_posting_list(savedVoc, "yy", filemanager)
    self.assertEqual(mot, {1: [0, 1]})
    mot = query.get_posting_list(savedVoc, "kk", filemanager)
    self.assertEqual(mot, {2: [0, 1], 23: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ii", filemanager)
    self.assertEqual(mot, {2: [0, 1]})
    mot = query.get_posting_list(savedVoc, "jj", filemanager)
    self.assertEqual(mot, {2: [0, 1]})
    mot = query.get_posting_list(savedVoc, "hh", filemanager)
    self.assertEqual(mot, {23: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ll", filemanager)
    self.assertEqual(mot, {}, 'll is considered a stopword')
def test_creation_postingLists(self):
    currentWorkspace = './tests/workspace/testfilemanager1/'
    filename = 'testfm1'
    postingList = dict()
    postingList[1] = [0, 101]
    postingList[2] = [0, 30023]
    postingList[34] = [0, 308.0]
    postingList[294] = [0, 159]
    postingList[2324] = [0, 3005]
    postingList[23445] = [0, 3006]
    filemanager = fm.FileManager(filename, currentWorkspace)
    filemanager.save_postList(postingList, 0)
    self.assertTrue(os.path.isfile(currentWorkspace + filename + '.pl'),
                    "The file .pl should exist")
    self.assertTrue(os.path.isfile(currentWorkspace + filename + '.vo'),
                    "The file .vo should exist")
def test_topk_trivial_file(self):
    pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')
    filemana = filemanager.FileManager(
        "TestFaginsTopK", "./tests/workspace/testsfaginstopk")
    tempVoc = dict()
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
    filemana.save_vocabularyAndPL_file(tempVoc)
    # Extraction of the saved Voc
    savedVoc = filemana.read_vocabulary()
    topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0, 5,
                                            naivetopk.conjunctive_queries)
    self.checkResultApproximative(topk, [(2, math.log(3 / 2))])
    topk = naivetopk.apply_naive_top_k_algo(['cc'], savedVoc, filemana, 0, 5,
                                            naivetopk.conjunctive_queries)
    self.checkResultApproximative(topk, [])
    topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc, filemana, 0, 5,
                                            naivetopk.conjunctive_queries)
    self.checkResultApproximative(topk, [])
    topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc, filemana, 0, 5,
                                            naivetopk.disjunctive_queries)
    self.checkResultApproximative(topk, [])
    topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0, 5,
                                            naivetopk.disjunctive_queries)
    self.checkResultApproximative(topk, [(2, math.log(3 / 2))])
    topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb'], savedVoc, filemana, 0, 1,
                                            naivetopk.disjunctive_queries)
    self.checkResultApproximative(
        topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2)])
    topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb', 'cc'], savedVoc, filemana,
                                            0, 1, naivetopk.disjunctive_queries)
    self.checkResultApproximative(
        topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 3)])
def test_modify_postingList(self):
    currentWorkspace = './tests/workspace/testfilemanager3/'
    filename = 'testfm3'
    postingList = dict()
    postingList[1] = [0, 101]
    postingList[23] = [0, 30023]
    postingList[234] = [0, 3006]
    filemanager = fm.FileManager(filename, currentWorkspace)
    # TODO: what does the offset change?
    filemanager.save_postList(postingList, 0)
    postingList[1] = [0, 201]
    filemanager.save_postList(postingList, 0)
    postingList[1] = [0, 301]
    filemanager.save_postList(postingList, 0)
    pl = filemanager.read_postList(0, 3)
    self.assertEqual(pl, {1: [0, 301], 23: [0, 30023], 234: [0, 3006]},
                     "The sorted Dict should be the same")
def test_simple(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test1/'
    filename = 'test1'
    pathlist = Path("./tests/data/test1/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
    filemanager = fm.FileManager(filename, currentWorkspace)
    filemanager.save_vocabularyAndPL_file(voc, False)
    savedVoc = filemanager.read_vocabulary()
    mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
    mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
    mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
    self.assertEqual(mot1, {1: [0, 3], 2: [0, 2], 3: [0, 1]})
    self.assertEqual(mot2, {1: [0, 1], 2: [0, 1]})
    self.assertEqual(mot3, {3: [0, 1]})
def analyse(nbNewspaper,
            path="./latimes/",
            flushEvery=1,
            analysisApproach=analysis.analyse_newspaper,
            mergeInTheEnd=True,
            useStemmer=True,
            sizeDocument=medium):
    """
    This benchmark analyses documents, keeps the VOC and PL in memory and
    flushes them to the hard drive when requested. In the end, a VOC and a
    PL file are created on the hard drive.

    nbNewspaper: number of newspapers to go through in path
    path: path to the directory containing the newspapers
    flushEvery: flush frequency (-1 to never flush)
    mergeInTheEnd: if False, no merge is performed at the end and the
        vocabulary is reset at the end of each loop
    """
    pathlist = Path(path).glob('**/la*')
    vocabulary = SortedDict()
    filemanager = fm.FileManager("benchmarkAnalysisTest")
    flushCounter = 0
    tmpPreprocessor = analysis.preprocessor
    if not useStemmer:
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activateStemmer=False))
    for i, newspaper_path in enumerate(pathlist):
        if i >= nbNewspaper:
            break
        flushCounter += 1
        analysisApproach(newspaper_path, vocabulary, False)
        if not mergeInTheEnd:
            vocabulary = SortedDict()
            continue
        if flushCounter >= flushEvery and flushEvery != -1:  # -1 means never flush
            flushCounter = 0
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = SortedDict()
    if mergeInTheEnd:
        filemanager.mergePartialVocsAndPL()
    analysis.setPreprocessor(tmpPreprocessor)
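# A minimal usage sketch of the benchmark above (the values and the ./latimes/
# path are hypothetical examples, not part of the test suite): analyse the
# first ten newspapers, flush a partial VOC/PL every two newspapers, then
# merge the partial files at the end.
#
#   analyse(10, path="./latimes/", flushEvery=2, mergeInTheEnd=True,
#           useStemmer=False)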
def test_with_stopwords(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test2/'
    filename = 'test2'
    pathlist = Path("./tests/data/test2/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
    filemanager = fm.FileManager(filename, currentWorkspace)
    filemanager.save_vocabularyAndPL_file(voc)
    # TODO: change this once a direct function is available
    savedVoc = filemanager.read_vocabulary()
    mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
    mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
    mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
    self.assertEqual(mot1, {1: [0, 1], 2: [0, 2]})
    self.assertEqual(mot2, {1: [0, 4], 2: [0, 1]})
    self.assertEqual(mot3, {2: [0, 2]})
    stop1 = query.get_posting_list(savedVoc, "doing", filemanager)
    self.assertEqual(stop1, {})
def analysis_parameters():
    global MAX_RANDOM_INDEXING
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", type=str,
                        help="directory containing the documents",
                        required=True)
    parser.add_argument(
        "-f", type=str,
        help="file name used to save the files produced by the indexing",
        required=True)
    parser.add_argument(
        "-o", type=str, default='./workspace/',
        help="directory in which the indexing output files are saved")
    parser.add_argument("--zip", action='store_true',
                        help="zip compression at the end")
    parser.add_argument(
        "--partial", type=int, default=-1,
        help="build the files by merging several partial files with the chosen "
             "document granularity. If -2, the granularity is one newspaper. "
             "Recommended value: 2000.")
    parser.add_argument("--stemmer", action='store_true',
                        help="enable the stemmer")
    parser.add_argument("--randomindexing", action='store_true',
                        help="enable random indexing")
    args = parser.parse_args()

    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"
    workspace_path = args.o
    if not args.o.endswith("/"):
        workspace_path += "/"
    pathlist = Path(latimes_path).glob('**/la*')
    vocabulary = dict()
    filemanager = fm.FileManager(args.f, workspace_path)
    random_indexing = None
    if args.randomindexing:
        random_indexing = ri.RandomIndexing()
    if args.stemmer:
        analysis.setPreprocessor(preprocessing.Preprocessor(True))

    if args.partial == -2:
        print("Partial analysis in progress")
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = dict()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged successfully")
    if args.partial != -1:
        nbDocsInMemory = 0
        stepFlush = args.partial
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                if rand_indexing_counter < MAX_RANDOM_INDEXING:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, random_indexing, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                else:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory >= stepFlush:
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
            rand_indexing_counter += 1
        if nbDocsInMemory != 0:
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged successfully")
        print("Inverted file created!")
    else:
        print("Non partial")
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            if rand_indexing_counter < MAX_RANDOM_INDEXING:
                rand_indexing_counter += 1
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           random_indexing, False)
            else:
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           None, False)
        analysis.computeIDF(vocabulary)
        filemanager.save_vocabularyAndPL_file(vocabulary)
        print("Inverted file created!")

    if args.zip:
        print("Compressing…")
        filemanager = fm.FileManager(args.f, args.o)
        zip.compressPLVBYTEFromSavedVocAndPL(filemanager)
        zip.compressZip(filemanager.getPathPLCompressed())
        zip.compressZip(filemanager.getPathVocCompressed())
        zip.compressZip(filemanager.getPathPLScore())
print("Compressed !") if args.randomindexing: filemanager.save_random_indexing(random_indexing.getTermsVectors(), random_indexing.getTermDimension()) print("Random indexing created")
    postingListsOrderedById = dict()
    postingListsOrderedById['aaa'] = pl1_id
    postingListsOrderedById['bbb'] = pl2_id
    print('postingListsOrderedById : {}'.format(postingListsOrderedById))
    print('postingListsOrderedByScore : {}'.format(postingListsOrderedByScore))
    return postingListsOrderedById, postingListsOrderedByScore


if __name__ == "__main__":
    # Applying the top-k algorithm to mock data:
    # postingListsOrderedById, postingListsOrderedByScore = createMockData()
    # c = find_fagins_ta(postingListsOrderedById, postingListsOrderedByScore, 3, aggregative_function_mean)
    # print("Result c : {}".format(c))
    currentWorkspace = './workspace/testfaginsta/'
    filename = 'test1'
    filemanag = fm.FileManager(filename, currentWorkspace)
    tempVoc = SortedDict()
    pathlist = Path("./tests/data/test4/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, True)
    filemanag.save_vocabularyAndPL_file(tempVoc)
    savedVoc = filemanag.read_vocabulary()
    faginsta = apply_fagins_ta(['aa', 'bb'], savedVoc, filemanag, 0.2, 2)
    print("result faginsTA : {}".format(faginsta))
parser.add_argument("--stemmer", action='store_true', help='activer stemmer') parser.add_argument("-n", type=str, required=True, help='nombre de synonymes pour la requête') args = parser.parse_args() workspace_path = args.d if not args.d.endswith("/"): workspace_path += "/" random_indexing = ri.RandomIndexing() filemanager = fm.FileManager(args.f, workspace_path) ri_term, ri_voc = filemanager.read_random_indexing( random_indexing.getTermDimension()) if args.stemmer: preprocessor = pp.Preprocessor(True) else: preprocessor = pp.Preprocessor(False) stemmed = preprocessor.process(args.t) try: indexToSearch = ri_term.index(stemmed[0]) print("Synonymes for : {} ".format(ri_term[indexToSearch])) res = classify(ri_voc[indexToSearch], ri_voc, int(args.n)) for i, term_index in enumerate(res): print("{:<3} : {}".format(i, ri_term[term_index]))
def analyseAndSaveDocumentsMultithread(array_of_newspapers, computeIDF=False):
    path = ""
    print("analyse_newspaper")
    print("Save only in the end, no merging involved")
    pathlist = Path("./../data/latimes/").glob('**/la*')
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToSave = []
    timeTotal = []
    timeToAnalyse = []
    timeToComputeIDF = []
    for numBatch, nbNewsPaperToRead in enumerate(array_of_newspapers):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbNewsPaperToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbNewspaperRed = 0
        nbDocsRed = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):
            if nbNewspaperRed >= nbNewsPaperToRead:
                break
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            nbDocsRed = docsRedInDocIteration + nbDocsRed
            nbNewspaperRed += 1
        if nbNewspaperRed < nbNewsPaperToRead:
            print("Benchmark invalid, as we ran out of newspapers to read.")
        timeToExtract.append(time.time() - start)
        print("Documents read :")
        print(nbDocsRed)
        if computeIDF:
            startComputeIDF = time.time()
            analysis.computeIDF(vocabulary)
            timeToComputeIDF.append(time.time() - startComputeIDF)
        start = time.time()
        print("Saving in progress…")
        filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
        timeToSave.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)
    analysis.setPreprocessor(tmpPreprocessor)
    print("Number of documents :")
    print(array_of_newspapers)
    plt.plot(array_of_newspapers, timeToExtract,
             label="Time to analyse documents")
    print("Time to extract :")
    print(timeToExtract)
    if computeIDF:
        plt.plot(array_of_newspapers, timeToComputeIDF,
                 label="Time to compute IDF")
        print("Time to compute IDF :")
        print(timeToComputeIDF)
    plt.plot(array_of_newspapers, timeToSave, label="Time to save")
    print("Time to save :")
    print(timeToSave)
    plt.plot(array_of_newspapers, timeTotal, label="Overall time")
    print("Overall Time :")
    print(timeTotal)
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
def analyseAndSaveDocuments(array_of_iterations,
                            computeIDF=False,
                            numberIterations=1):
    totaltimeToExtract = []
    totaltimeToSave = []
    totaltimeTotal = []
    totaltimeToComputeIDF = []
    for i in range(0, numberIterations):
        path = ""
        print("analyse_newspaper")
        print("Save only in the end, no merging involved")
        pathlist = Path("./../data/latimes/").glob('**/la*')
        tmpPreprocessor = analysis.preprocessor
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activate_stemmer=False))
        timeToExtract = []
        timeToSave = []
        timeTotal = []
        timeToComputeIDF = []
        for numBatch, nbDocsToRead in enumerate(array_of_iterations):
            startBatch = time.time()
            folder = './workspace/'
            for the_file in os.listdir(folder):
                file_path = os.path.join(folder, the_file)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except Exception as e:
                    print(e)
            filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                         str(nbDocsToRead))
            start = time.time()
            pathlist = Path("./../data/latimes/").glob('**/la*')
            vocabulary = dict()
            nbDocsRed = 0
            print("analysis in progress")
            for i, newspaper_path in enumerate(pathlist):
                if nbDocsRed >= nbDocsToRead:
                    break
                docsRedInDocIteration = -1
                while (docsRedInDocIteration != 0):
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False, 0,
                        nbDocsToRead - nbDocsRed)
                    nbDocsRed = docsRedInDocIteration + nbDocsRed
                    if nbDocsRed >= nbDocsToRead:
                        break
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed < nbDocsToRead:
                print("Benchmark invalid, as we ran out of documents to read.")
            timeToExtract.append(time.time() - start)
            if computeIDF:
                startComputeIDF = time.time()
                analysis.computeIDF(vocabulary)
                timeToComputeIDF.append(time.time() - startComputeIDF)
            start = time.time()
            print("Saving in progress…")
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
            timeToSave.append(time.time() - start)
            timeTotal.append(time.time() - startBatch)
        analysis.setPreprocessor(tmpPreprocessor)
        print("Number of documents :")
        print(array_of_iterations)
        plt.plot(array_of_iterations, timeToExtract,
                 label="Time to analyse documents")
        print("Time to extract :")
        print(timeToExtract)
        if computeIDF:
            plt.plot(array_of_iterations, timeToComputeIDF,
                     label="Time to compute IDF")
            print("Time to compute IDF :")
            print(timeToComputeIDF)
        plt.plot(array_of_iterations, timeToSave, label="Time to save")
        print("Time to save :")
        print(timeToSave)
        plt.plot(array_of_iterations, timeTotal, label="Overall time")
        print("Overall Time :")
        print(timeTotal)
        plt.xlabel("Number of Documents")
        plt.ylabel("Time (s)")
        plt.legend()
        plt.show()
        totaltimeToExtract.append(timeToExtract)
        if computeIDF:
            totaltimeToComputeIDF.append(timeToComputeIDF)
        totaltimeToSave.append(timeToSave)
        totaltimeTotal.append(timeTotal)
    if computeIDF:
        print("computeidf")
        resIDF = [0] * len(totaltimeToComputeIDF[0])
        for arr in totaltimeToComputeIDF:
            for i, elt in enumerate(arr):
                resIDF[i] = resIDF[i] + elt / len(totaltimeToComputeIDF)
        print(totaltimeToComputeIDF)
        print(resIDF)
    print("extract")
    resextract = [0] * len(totaltimeToExtract[0])
    for arr in totaltimeToExtract:
        for i, elt in enumerate(arr):
            resextract[i] = resextract[i] + elt / len(totaltimeToExtract)
    print(totaltimeToExtract)
    print(resextract)
    print("save")
    ressave = [0] * len(totaltimeToSave[0])
    for arr in totaltimeToSave:
        for i, elt in enumerate(arr):
            ressave[i] = ressave[i] + elt / len(totaltimeToSave)
    print(totaltimeToSave)
    print(ressave)
    print("total")
    restotal = [0] * len(totaltimeTotal[0])
    for arr in totaltimeTotal:
        for i, elt in enumerate(arr):
            restotal[i] = restotal[i] + elt / len(totaltimeTotal)
    print(totaltimeTotal)
    print(restotal)
    plt.plot(array_of_iterations, resextract,
             label="Time to analyse documents")
    if computeIDF:
        plt.plot(array_of_iterations, resIDF, label="Time to compute IDF")
    plt.plot(array_of_iterations, ressave, label="Time to save")
    plt.plot(array_of_iterations, restotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
def analyseAndMergeDocuments(array_of_iterations, stepFlush):
    path = ""
    print("analyse_newspaper")
    print("Merging involved, flush frequency : Every " + str(stepFlush) +
          " document.")
    pathlist = Path("./../data/latimes/").glob('**/la*')
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToMerge = []
    timeToFlush = [0] * len(array_of_iterations)
    timeTotal = []
    timeToAnalyse = []
    for numBatch, nbDocsToRead in enumerate(array_of_iterations):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbDocsToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbDocsRed = 0
        nbDocsInMemory = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):
            if nbDocsRed >= nbDocsToRead:
                break
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                docsRedInDocIteration = analysis.analyse_newspaper(
                    newspaper_path, vocabulary, None, False,
                    nbDocsRedInThisJournal,
                    nbDocsRedInThisJournal + stepFlush)
                nbDocsRed = docsRedInDocIteration + nbDocsRed
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory == stepFlush or nbDocsRed >= nbDocsToRead:
                    startFlush = time.time()
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                    timeToFlush[numBatch] += (time.time() - startFlush)
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed >= nbDocsToRead:
                break
        if nbDocsRed < nbDocsToRead:
            print("Benchmark invalid, as we ran out of documents to read.")
        timeToExtract.append(time.time() - start)
        start = time.time()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        timeToMerge.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)
    analysis.setPreprocessor(tmpPreprocessor)
    print(array_of_iterations)
    print("Ttmerge")
    print(timeToMerge)
    plt.plot(array_of_iterations, timeToMerge, label="Time to merge")
    print("Ttextract")
    print(timeToExtract)
    plt.plot(array_of_iterations, timeToExtract,
             label="Time to analyse document (with flushing)")
    print("Ttflush")
    print(timeToFlush)
    plt.plot(array_of_iterations, timeToFlush, label="Time to flush documents")
    print("Overalltime")
    print(timeTotal)
    plt.plot(array_of_iterations, timeTotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
def analysis_parameters():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", type=str, default='./workspace/',
                        help="directory containing the VOC and PL files produced by the indexing")
    parser.add_argument("-f", type=str, required=True,
                        help="name of the VOC and PL files")
    parser.add_argument("-q", type=str, required=True,
                        help="query terms separated by commas, e.g. voiture,maison")
    parser.add_argument("-n", type=int, default=3,
                        help="number of documents to return")
    parser.add_argument("--stemmer", action='store_true',
                        help="enable stemming on the query terms")
    parser.add_argument("--algo", type=str, default="naive",
                        help="algorithm used to answer the query")
    parser.add_argument("--view", type=str, default="simple",
                        help="type of visualisation; possible options: simple or fullText")
    parser.add_argument("--vpath", type=str, default="./data/latimes/",
                        help="path of the source files for --view fullText")
    parser.add_argument("--improvedquery", action='store_true',
                        help="enable synonym lookup to improve the query")
    args = parser.parse_args()

    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"
    filemanager = fm.FileManager(args.f, latimes_path)
    savedVoc = filemanager.read_vocabulary()
    if args.stemmer:
        print("Stemmer activated")
        preprocessor = preprocessing.Preprocessor(True)
    else:
        preprocessor = preprocessing.Preprocessor(False)
    epsilon = 0
    switchAlgo = {"naive": naivetopk.apply_naive_top_k_algo,
                  "fagins": faginstopk.apply_top_k_algo,
                  "faginsTA": faginsta.apply_fagins_ta}
    algoFunct = switchAlgo[args.algo]
    words = preprocessor.process(args.q)
    words_request = []
    if args.improvedquery:
        random_indexing = ri.RandomIndexing()
        for word in words:
            words_request.append(word)
            try:
                synonymes = synknn.get_synonyms(
                    word, 2, random_indexing.getTermDimension(), filemanager)
                if len(synonymes) == 2:
                    words_request.append(synonymes[1])
            except Exception as e:
                print(e)
        print("Improved query: {}".format(words_request))
    else:
        words_request = words
    if (not filemanager.doesUnCompressedVersionExists()) and \
            filemanager.doesCompressedVersionExists():
        print("Unzipping in progress…")
        compressor.decompressZip(filemanager.getPathPLCompressed(),
                                 filemanager.getPathPLCompressed())
        compressor.decompressZip(filemanager.getPathVocCompressed(),
                                 filemanager.getPathVocCompressed())
        compressor.decompressZip(filemanager.getPathPLScore(),
                                 filemanager.getPathPLScore())
        compressor.decompressPLVBYTE(filemanager)
    result = algoFunct(words_request, savedVoc, filemanager, epsilon, args.n)
    switchView = {"simple": view.displayResults,
                  "fullText": view.displayResultsText}
    viewFunct = switchView[args.view]
    print("\nResults: ")
    viewFunct(result, args.vpath)
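# Example invocation of this query CLI (the script name query_cli.py is
# hypothetical; the flags match the argparse definitions above): retrieve the
# top 5 documents for the query "voiture,maison" using Fagin's TA and the
# full-text view.
#
#   python query_cli.py -d ./workspace/ -f latimesIndex -q voiture,maison \
#       -n 5 --algo faginsTA --view fullText --vpath ./data/latimes/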