def read_source_file(path):
    """Read the raw newspaper data below *path* and return the preprocessed words.

    Every file matching ``la*`` (searched recursively) holds a sequence of
    <DOC> elements with no single root, so the raw text is wrapped in a fake
    <NEWSPAPER> element to obtain well-formed XML.  The text of every
    ./TEXT/P paragraph of every document is concatenated and fed to the
    preprocessor.

    Postcondition: returns a flat list of the words found in the files.
    """
    preprocessor = preprocessing.Preprocessor(True)
    pathlist = Path(path).glob('**/la*')
    terms = []
    for newspaper_path in pathlist:
        raw = newspaper_path.read_text()
        tree = etree.fromstring("<NEWSPAPER>" + raw + "</NEWSPAPER>")
        for document in tree.xpath("//DOC"):
            # Guard against empty <P/> elements: lxml reports their .text
            # as None, which would crash naive string concatenation.
            text = "".join(p.text or "" for p in document.xpath("./TEXT/P"))
            terms.extend(preprocessor.process(text))
    return terms
def analyse(nbNewspaper,
            path="./latimes/",
            flushEvery=1,
            analysisApproach=analysis.analyse_newspaper,
            mergeInTheEnd=True,
            useStemmer=True,
            sizeDocument=medium):
    """Benchmark: analyse documents, keep VOC and PL in memory, flush on demand.

    In the end, a VOC and PL file will be created on the hard drive.

    nbNewspaper    -- number of newspapers to go through in *path*.
    path           -- path to the directory containing the newspapers.
    flushEvery     -- flush frequency in newspapers (-1 to never flush).
    analysisApproach -- callable(newspaper_path, vocabulary, flag) doing the analysis.
    mergeInTheEnd  -- if False, no final merge is done and the vocabulary is
                      reset after each newspaper.
    useStemmer     -- if False, a stemmer-less preprocessor is installed for
                      the duration of the call (the previous one is restored).
    sizeDocument   -- default is the module-level ``medium`` constant.
    """
    pathlist = Path(path).glob('**/la*')
    vocabulary = SortedDict()
    filemanager = fm.FileManager("benchmarkAnalysisTest")
    flushCounter = 0
    # Remember the active preprocessor so it can be restored on exit.
    tmpPreprocessor = analysis.preprocessor
    if not useStemmer:
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activateStemmer=False))
    for i, newspaper_path in enumerate(pathlist):
        if i >= nbNewspaper:
            break
        flushCounter += 1
        analysisApproach(newspaper_path, vocabulary, False)
        if not mergeInTheEnd:
            # No merge requested: throw the partial vocabulary away each loop.
            vocabulary = SortedDict()
            continue
        # BUGFIX: the guard used to read ``flushEvery != 1``, which both
        # contradicts the documented "never flush" sentinel (-1) and made the
        # default flushEvery=1 never flush at all.
        if flushCounter >= flushEvery and flushEvery != -1:
            flushCounter = 0
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = SortedDict()
    if mergeInTheEnd:
        filemanager.mergePartialVocsAndPL()
    analysis.setPreprocessor(tmpPreprocessor)
def analysis_parameters():
    """CLI entry point for building the inverted file (VOC + PL).

    Parses the command line, then indexes the corpus in one of three modes:
    --partial -2  : one partial flush per newspaper file, then merge;
    --partial N>0 : flush every N documents, then merge;
    --partial -1  : (default) index everything in memory, compute IDF, save once.
    Optionally compresses the result (--zip) and saves the random-indexing
    vectors (--randomindexing).
    """
    global MAX_RANDOM_INDEXING
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", type=str, help="dossier avec les documents",
                        required=True)
    parser.add_argument(
        "-f", type=str,
        help="nom de fichier pour enregistrer les fichiers après l'indexation ",
        required=True)
    parser.add_argument(
        "-o", type=str, default='./workspace/',
        help="dossier pour enregistrer les fichiers après l'indexation ")
    parser.add_argument("--zip", action='store_true',
                        help="compression zip à la fin")
    parser.add_argument(
        "--partial", type=int, default=-1,
        help='créer les fichiers par réunion de plusieurs fichiers avec une granularité de documents choisie. Si -2, alors granularité d\'un journal. Valeur conseillée : 2000.')
    parser.add_argument("--stemmer", action='store_true',
                        help='activer stemmer')
    parser.add_argument("--randomindexing", action='store_true',
                        help='activer random indexing')
    args = parser.parse_args()
    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"
    workspace_path = args.o
    # BUGFIX: the original re-tested args.d here (copy-paste), so a -o value
    # lacking a trailing slash was never normalised.
    if not args.o.endswith("/"):
        workspace_path += "/"
    pathlist = Path(latimes_path).glob('**/la*')
    vocabulary = dict()
    filemanager = fm.FileManager(args.f, workspace_path)
    random_indexing = None
    if args.randomindexing:
        random_indexing = ri.RandomIndexing()
    if args.stemmer:
        analysis.setPreprocessor(preprocessing.Preprocessor(True))
    if args.partial == -2:
        # Newspaper-level granularity: flush one partial VOC/PL per file.
        print("Partial analysis in progress")
        for newspaper_path in tqdm(list(pathlist)):
            analysis.analyse_newspaper(newspaper_path, vocabulary, None, False)
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = dict()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
    elif args.partial != -1:
        # BUGFIX: this branch is now an ``elif``.  Previously a --partial -2
        # run also entered it (-2 != -1), looping over the already-exhausted
        # path generator and merging a second time.
        nbDocsInMemory = 0
        stepFlush = args.partial
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            # analyse_newspaper returns 0 once the journal is exhausted.
            while docsRedInDocIteration != 0:
                if rand_indexing_counter < MAX_RANDOM_INDEXING:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, random_indexing, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                else:
                    # Random-indexing budget exhausted: index without it.
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory >= stepFlush:
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
            rand_indexing_counter += 1
        if nbDocsInMemory != 0:
            # Flush the remainder that never reached stepFlush.
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
        print("Inverted file created !")
    else:
        print("Non partial")
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            if rand_indexing_counter < MAX_RANDOM_INDEXING:
                rand_indexing_counter += 1
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           random_indexing, False)
            else:
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           None, False)
        analysis.computeIDF(vocabulary)
        filemanager.save_vocabularyAndPL_file(vocabulary)
        print("Inverted file created !")
    if args.zip:
        print("Compressing…")
        filemanager = fm.FileManager(args.f, args.o)
        zip.compressPLVBYTEFromSavedVocAndPL(filemanager)
        zip.compressZip(filemanager.getPathPLCompressed())
        zip.compressZip(filemanager.getPathVocCompressed())
        zip.compressZip(filemanager.getPathPLScore())
        print("Compressed !")
    if args.randomindexing:
        filemanager.save_random_indexing(random_indexing.getTermsVectors(),
                                         random_indexing.getTermDimension())
        print("Random indexing created")
help='activer stemmer') parser.add_argument("-n", type=str, required=True, help='nombre de synonymes pour la requête') args = parser.parse_args() workspace_path = args.d if not args.d.endswith("/"): workspace_path += "/" random_indexing = ri.RandomIndexing() filemanager = fm.FileManager(args.f, workspace_path) ri_term, ri_voc = filemanager.read_random_indexing( random_indexing.getTermDimension()) if args.stemmer: preprocessor = pp.Preprocessor(True) else: preprocessor = pp.Preprocessor(False) stemmed = preprocessor.process(args.t) try: indexToSearch = ri_term.index(stemmed[0]) print("Synonymes for : {} ".format(ri_term[indexToSearch])) res = classify(ri_voc[indexToSearch], ri_voc, int(args.n)) for i, term_index in enumerate(res): print("{:<3} : {}".format(i, ri_term[term_index])) except ValueError as e: print(args.t + ' is not in the indexed list')
def analysis_parameters():
    """CLI entry point for querying an inverted file.

    Parses the command line, loads the saved vocabulary, optionally expands
    the query with synonyms (random indexing), transparently decompresses
    the PL/VOC files when only the zipped version exists, runs the selected
    top-k algorithm and displays the results with the chosen view.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", type=str, default='./workspace/',
                        help="dossier avec les fichier VOC et PL résultat de l'indexation")
    parser.add_argument("-f", type=str, help="nom de fichier VOC et PL ",
                        required=True)
    parser.add_argument("-q", type=str,
                        help="requête des termes separés par un virgule. Ex: voiture,maison ",
                        required=True)
    parser.add_argument("-n", type=int, default=3,
                        help="nombre de résultats souhaité de documents ")
    parser.add_argument("--stemmer", action='store_true',
                        help="activer le stemming sur les termes de la requête")
    parser.add_argument("--algo", type=str, default="naive",
                        help="algorithme souhaité pour la requête ")
    parser.add_argument("--view", type=str, default="simple",
                        help="type de visualisation. Options possible: simple ou fullText ")
    parser.add_argument("--vpath", type=str, default="./data/latimes/",
                        help="path des fichier sources pour --view fullText")
    parser.add_argument("--improvedquery", action='store_true',
                        help="activer recherche de synonymes pour l'amélioration de la requête")
    args = parser.parse_args()
    # NOTE(review): despite its name, -d / latimes_path points at the
    # workspace holding the VOC/PL files (default ./workspace/), not the
    # LA Times corpus itself.
    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"
    filemanager = fm.FileManager(args.f, latimes_path)
    savedVoc = filemanager.read_vocabulary()
    if args.stemmer:
        print("Stemmer activated")
        preprocessor = preprocessing.Preprocessor(True)
    else:
        preprocessor = preprocessing.Preprocessor(False)
    epsilon = 0
    # Dispatch table: algorithm name -> top-k implementation.
    switchAlgo = {"naive": naivetopk.apply_naive_top_k_algo,
                  "fagins": faginstopk.apply_top_k_algo,
                  "faginsTA": faginsta.apply_fagins_ta}
    algoFunct = switchAlgo[args.algo]
    words = preprocessor.process(args.q)
    words_request = []
    if args.improvedquery:
        random_indexing = ri.RandomIndexing()
        for word in words:
            words_request.append(word)
            try:
                # Ask for 2 neighbours: the word itself plus one synonym.
                synonymes = synknn.get_synonyms(
                    word, 2, random_indexing.getTermDimension(), filemanager)
                if len(synonymes) == 2:
                    words_request.append(synonymes[1])
            except Exception as e:
                # Best effort: a word missing from the index must not
                # abort the whole query.
                print(e)
        print("Improved query: {}".format(words_request))
    else:
        words_request = words
    # Transparently unzip the index when only the compressed form is on disk.
    if (not filemanager.doesUnCompressedVersionExists()) and filemanager.doesCompressedVersionExists():
        print("Unzipping in progress…")
        compressor.decompressZip(filemanager.getPathPLCompressed(), filemanager.getPathPLCompressed())
        compressor.decompressZip(filemanager.getPathVocCompressed(), filemanager.getPathVocCompressed())
        compressor.decompressZip(filemanager.getPathPLScore(), filemanager.getPathPLScore())
        compressor.decompressPLVBYTE(filemanager)
    result = algoFunct(words_request, savedVoc, filemanager, epsilon, args.n)
    # Dispatch table: view name -> rendering function.
    switchView = {"simple": view.displayResults,
                  "fullText": view.displayResultsText}
    viewFunct = switchView[args.view]
    print("\nResults: ")
    viewFunct(result, args.vpath)
import argparse import wordtraveller.compressor as compressor import wordtraveller.faginstavf as faginsta import wordtraveller.faginstopkvf as faginstopk import wordtraveller.filemanager as fm import wordtraveller.naivetopk as naivetopk import wordtraveller.preprocessing as preprocessing import wordtraveller.randomIndexing as ri import wordtraveller.randomIndexingFindSynonym as synknn import wordtraveller.view as view preprocessor = preprocessing.Preprocessor(True) def analysis_parameters(): parser = argparse.ArgumentParser() parser.add_argument("-d", type=str, default='./workspace/', help="dossier avec les fichier VOC et PL résultat de l'indexation") parser.add_argument("-f", type=str, help="nom de fichier VOC et PL ", required=True) parser.add_argument("-q", type=str, help="requête des termes separés par un virgule. Ex: voiture,maison ", required=True) parser.add_argument("-n", type=int, default=3, help="nombre de résultats souhaité de documents ") parser.add_argument("--stemmer", action='store_true', help="activer le stemming sur les termes de la requête") parser.add_argument("--algo", type=str, default="naive", help="algorithme souhaité pour la requête ") parser.add_argument("--view", type=str, default="simple",
import math import re import time import numpy from pathlib import Path from threading import Thread import nltk from lxml import etree from sortedcontainers import SortedDict import wordtraveller.filemanager as fm import wordtraveller.preprocessing as preprocessing preprocessor = preprocessing.Preprocessor() class AnalyseThread(Thread): def __init__(self, function, chunkpath, voc, randIndexing=None, computeIDF=False, nbDocToStart=0, nbDocToScan=-1): Thread.__init__(self) self.function = function self.chunkpath = chunkpath self.voc = voc self.randIndexing = randIndexing
def analyseAndMergeDocuments(array_of_iterations, stepFlush):
    """Benchmark indexing with periodic flushing plus a final merge.

    For each document count in *array_of_iterations*: wipes ./workspace/,
    analyses newspapers from ./../data/latimes/ until that many documents
    have been read (flushing a partial VOC/PL every *stepFlush* documents),
    merges the partial files, then plots extract / flush / merge / total
    times against the document counts.
    """
    path = ""  # unused
    print("analyse_newspaper")
    print("Merging involved, flush frequency : Every " + str(stepFlush) +
          " document.")
    pathlist = Path("./../data/latimes/").glob('**/la*')
    # Install a stemmer-less preprocessor for the benchmark; the previous
    # one is restored after the batch loop.
    # NOTE(review): other call sites pass activateStemmer=... — confirm the
    # Preprocessor keyword is actually spelled activate_stemmer here.
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToMerge = []
    # One accumulated flush time per batch (flushes happen many times).
    timeToFlush = [0] * len(array_of_iterations)
    timeTotal = []
    timeToAnalyse = []  # unused
    for numBatch, nbDocsToRead in enumerate(array_of_iterations):
        startBatch = time.time()
        # Empty the workspace so partial files from a previous batch
        # cannot interfere with the merge.
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbDocsToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbDocsRed = 0
        nbDocsInMemory = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):
            if nbDocsRed >= nbDocsToRead:
                break
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            # analyse_newspaper returns 0 once this journal is exhausted.
            while (docsRedInDocIteration != 0):
                docsRedInDocIteration = analysis.analyse_newspaper(
                    newspaper_path, vocabulary, None, False,
                    nbDocsRedInThisJournal,
                    nbDocsRedInThisJournal + stepFlush)
                nbDocsRed = docsRedInDocIteration + nbDocsRed
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory == stepFlush or nbDocsRed >= nbDocsToRead:
                    # Flush the in-memory vocabulary as a partial VOC/PL,
                    # timing only the flush itself.
                    startFlush = time.time()
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                    timeToFlush[numBatch] += (time.time() - startFlush)
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed >= nbDocsToRead:
                break
        if nbDocsRed < nbDocsToRead:
            print("Benchmark invalid, as we ran out of documents to read.")
        timeToExtract.append(time.time() - start)
        start = time.time()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        timeToMerge.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)
    analysis.setPreprocessor(tmpPreprocessor)
    # Dump raw numbers and plot every timing series.
    print(array_of_iterations)
    print("Ttmerge")
    print(timeToMerge)
    plt.plot(array_of_iterations, timeToMerge, label="Time to merge")
    print("Ttextract")
    print(timeToExtract)
    plt.plot(array_of_iterations, timeToExtract,
             label="Time to analyse document (with flushing)")
    print("Ttflush")
    print(timeToFlush)
    plt.plot(array_of_iterations, timeToFlush,
             label="Time to flush documents")
    print("Overalltime")
    print(timeTotal)
    plt.plot(array_of_iterations, timeTotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
def analyseAndSaveDocuments(array_of_iterations,
                            computeIDF=False,
                            numberIterations=1):
    """Benchmark indexing with a single save at the end (no merging).

    Repeats the whole measurement *numberIterations* times: for each
    document count in *array_of_iterations* it wipes ./workspace/, indexes
    that many documents fully in memory, optionally times computeIDF, and
    times the final save.  Each run is plotted, then the per-count averages
    across runs are printed and plotted.
    """
    totaltimeToExtract = []
    totaltimeToSave = []
    totaltimeTotal = []
    totaltimeToComputeIDF = []
    for i in range(0, numberIterations):
        path = ""  # unused
        print("analyse_newspaper")
        print("Save only in the end, no merging involved")
        pathlist = Path("./../data/latimes/").glob('**/la*')
        # Install a stemmer-less preprocessor; restored after the batch loop.
        # NOTE(review): other call sites pass activateStemmer=... — confirm
        # the keyword spelling activate_stemmer.
        tmpPreprocessor = analysis.preprocessor
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activate_stemmer=False))
        timeToExtract = []
        timeToSave = []
        timeTotal = []
        timeToComputeIDF = []
        for numBatch, nbDocsToRead in enumerate(array_of_iterations):
            startBatch = time.time()
            # Empty the workspace before each batch.
            folder = './workspace/'
            for the_file in os.listdir(folder):
                file_path = os.path.join(folder, the_file)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except Exception as e:
                    print(e)
            filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                         str(nbDocsToRead))
            start = time.time()
            pathlist = Path("./../data/latimes/").glob('**/la*')
            vocabulary = dict()
            nbDocsRed = 0
            print("analysis in progress")
            # NOTE(review): this loop variable shadows the outer iteration
            # index `i`; harmless because the outer value is unused below.
            for i, newspaper_path in enumerate(pathlist):
                if nbDocsRed >= nbDocsToRead:
                    break
                docsRedInDocIteration = -1
                # analyse_newspaper returns 0 once this journal is exhausted.
                while (docsRedInDocIteration != 0):
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False, 0,
                        nbDocsToRead - nbDocsRed)
                    nbDocsRed = docsRedInDocIteration + nbDocsRed
                    if nbDocsRed >= nbDocsToRead:
                        break
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed < nbDocsToRead:
                print("Benchmark invalid, as we ran out of documents to read.")
            timeToExtract.append(time.time() - start)
            if computeIDF:
                startComputeIDF = time.time()
                analysis.computeIDF(vocabulary)
                timeToComputeIDF.append(time.time() - startComputeIDF)
            start = time.time()
            print("Saving in progress…")
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
            timeToSave.append(time.time() - start)
            timeTotal.append(time.time() - startBatch)
        analysis.setPreprocessor(tmpPreprocessor)
        # Plot this run's timings.
        print("Number of documents :")
        print(array_of_iterations)
        plt.plot(array_of_iterations, timeToExtract,
                 label="Time to analyse documents")
        print("Time to extract :")
        print(timeToExtract)
        if computeIDF:
            plt.plot(array_of_iterations, timeToComputeIDF,
                     label="Time to compute IDF")
            print("Time to compute IDF :")
            print(timeToComputeIDF)
        plt.plot(array_of_iterations, timeToSave, label="Time to save")
        print("Time to save :")
        print(timeToSave)
        plt.plot(array_of_iterations, timeTotal, label="Overall time")
        print("Overall Time :")
        print(timeTotal)
        plt.xlabel("Number of Documents")
        plt.ylabel("Time (s)")
        plt.legend()
        plt.show()
        # Keep this run's series for cross-run averaging below.
        totaltimeToExtract.append(timeToExtract)
        if computeIDF:
            totaltimeToComputeIDF.append(timeToComputeIDF)
        totaltimeToSave.append(timeToSave)
        totaltimeTotal.append(timeTotal)
    # Average each metric across runs (element-wise mean) and plot the means.
    if computeIDF:
        print("computeidf")
        resIDF = [0] * len(totaltimeToComputeIDF[0])
        for arr in totaltimeToComputeIDF:
            for i, elt in enumerate(arr):
                resIDF[i] = resIDF[i] + elt / len(totaltimeToComputeIDF)
        print(totaltimeToComputeIDF)
        print(resIDF)
    print("extract")
    resextract = [0] * len(totaltimeToExtract[0])
    for arr in totaltimeToExtract:
        for i, elt in enumerate(arr):
            resextract[i] = resextract[i] + elt / len(totaltimeToExtract)
    print(totaltimeToExtract)
    print(resextract)
    print("save")
    ressave = [0] * len(totaltimeToSave[0])
    for arr in totaltimeToSave:
        for i, elt in enumerate(arr):
            ressave[i] = ressave[i] + elt / len(totaltimeToSave)
    print(totaltimeToSave)
    print(ressave)
    print("total")
    restotal = [0] * len(totaltimeTotal[0])
    for arr in totaltimeTotal:
        for i, elt in enumerate(arr):
            restotal[i] = restotal[i] + elt / len(totaltimeTotal)
    print(totaltimeTotal)
    print(restotal)
    plt.plot(array_of_iterations, resextract,
             label="Time to analyse documents")
    if computeIDF:
        plt.plot(array_of_iterations, resIDF, label="Time to compute IDF")
    plt.plot(array_of_iterations, ressave, label="Time to save")
    plt.plot(array_of_iterations, restotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
def analyseAndSaveDocumentsMultithread(array_of_newspapers, computeIDF=False):
    """Benchmark indexing per *number of newspapers* with one final save.

    For each newspaper count in *array_of_newspapers*: wipes ./workspace/,
    indexes that many newspaper files fully in memory, optionally times
    computeIDF, times the final save, then plots all timing series.

    NOTE(review): despite the name, no threading happens in this function —
    TODO confirm whether a multithreaded variant was intended here.
    """
    path = ""  # unused
    print("analyse_newspaper")
    print("Save only in the end, no merging involved")
    pathlist = Path("./../data/latimes/").glob('**/la*')
    # Install a stemmer-less preprocessor; restored after the batch loop.
    # NOTE(review): other call sites pass activateStemmer=... — confirm the
    # keyword spelling activate_stemmer.
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToSave = []
    timeTotal = []
    timeToAnalyse = []  # unused
    timeToComputeIDF = []
    for numBatch, nbNewsPaperToRead in enumerate(array_of_newspapers):
        startBatch = time.time()
        # Empty the workspace before each batch.
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbNewsPaperToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbNewspaperRed = 0
        nbDocsRed = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):
            if nbNewspaperRed >= nbNewsPaperToRead:
                break
            # Whole newspaper at once (no stepFlush window here).
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            nbDocsRed = docsRedInDocIteration + nbDocsRed
            nbNewspaperRed += 1
        if nbNewspaperRed < nbNewsPaperToRead:
            print("Benchmark invalid, as we ran out of newspaper to read.")
        timeToExtract.append(time.time() - start)
        print("We red documents : ")
        print(nbDocsRed)
        if computeIDF:
            startComputeIDF = time.time()
            analysis.computeIDF(vocabulary)
            timeToComputeIDF.append(time.time() - startComputeIDF)
        start = time.time()
        print("Saving in progress…")
        filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
        timeToSave.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)
    analysis.setPreprocessor(tmpPreprocessor)
    # Dump raw numbers and plot every timing series.
    print("Number of documents :")
    print(array_of_newspapers)
    plt.plot(array_of_newspapers, timeToExtract,
             label="Time to analyse documents")
    print("Time to extract :")
    print(timeToExtract)
    if computeIDF:
        plt.plot(array_of_newspapers, timeToComputeIDF,
                 label="Time to compute IDF")
        print("Time to compute IDF :")
        print(timeToComputeIDF)
    plt.plot(array_of_newspapers, timeToSave, label="Time to save")
    print("Time to save :")
    print(timeToSave)
    plt.plot(array_of_newspapers, timeTotal, label="Overall time")
    print("Overall Time :")
    print(timeTotal)
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()