Example #1
def read_source_file(path):
    """ Read the raw newspaper data from ./data/latimes/ and return the content
    of the p tags (after preprocessing)
    Postcondition : Returns a list of words foundt in ./data/latimes
    """
    preprocessor = preprocessing.Preprocessor(True)
    pathlist = Path(path).glob('**/la*')
    terms = []
    for i, newspaper_path in enumerate(pathlist):
        raw = newspaper_path.read_text()
        tree = etree.fromstring("<NEWSPAPER>" + raw + "</NEWSPAPER>")
        for document in tree.xpath("//DOC"):
            text = ""
            for p in document.xpath("./TEXT/P"):
                if p.text:
                    text += p.text
            terms.extend(preprocessor.process(text))
    return terms
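A minimal usage sketch for this helper, assuming the ./data/latimes/ layout and the `preprocessing` import shown in the other examples; the printed counts are purely illustrative:

# Hypothetical driver: assumes ./data/latimes/ contains the la* source files
# and that wordtraveller.preprocessing is importable as `preprocessing`.
if __name__ == "__main__":
    words = read_source_file("./data/latimes/")
    print("Words after preprocessing:", len(words))
    print("First terms:", words[:10])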
Example #2
def analyse(nbNewspaper,
            path="./latimes/",
            flushEvery=1,
            analysisApproach=analysis.analyse_newspaper,
            mergeInTheEnd=True,
            useStemmer=True,
            sizeDocument=medium):
    """
  This benchmark will analyse documents, put the VOC and PL in memory
  and eventually flush it to the hardrive if requested. 
  In the end, a VOC and PL file will be created on the hardrive
  
  nbNewspaper is the number of newspaper we will go through in path
  path is the path to the directory
  flushEvery is the frequency of flush. (-1 if we never flush)
  mergeInTheEnd : if false, no merge in the end is proceeded and vocabulary is reset at the end of each loop
  """
    pathlist = Path(path).glob('**/la*')
    vocabulary = SortedDict()
    filemanager = fm.FileManager("benchmarkAnalysisTest")
    flushCounter = 0
    tmpPreprocessor = analysis.preprocessor
    if not useStemmer:
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activateStemmer=False))
    for i, newspaper_path in enumerate(pathlist):
        if i >= nbNewspaper:
            break

        flushCounter += 1
        analysisApproach(newspaper_path, vocabulary, False)
        if not mergeInTheEnd:
            vocabulary = SortedDict()
            continue
        if flushCounter >= flushEvery and flushEvery != -1:
            flushCounter = 0
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = SortedDict()
    if mergeInTheEnd:
        filemanager.mergePartialVocsAndPL()
    analysis.setPreprocessor(tmpPreprocessor)
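A hedged sketch of how this benchmark might be invoked; the newspaper count and flush frequency are illustrative, and the default analysisApproach is the analysis.analyse_newspaper already named in the signature:

# Hypothetical run: analyse 10 newspapers, flush a partial VOC/PL to disk
# every 2 newspapers, then merge the partial files at the end.
analyse(10,
        path="./latimes/",
        flushEvery=2,
        mergeInTheEnd=True,
        useStemmer=False)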
Example #3
def analysis_parameters():
    global MAX_RANDOM_INDEXING

    parser = argparse.ArgumentParser()

    parser.add_argument("-d",
                        type=str,
                        help="dossier avec les documents",
                        required=True)
    parser.add_argument(
        "-f",
        type=str,
        help="nom de fichier pour enregistrer les fichiers après l'indexation ",
        required=True)
    parser.add_argument(
        "-o",
        type=str,
        default='./workspace/',
        help="dossier pour enregistrer les fichiers après l'indexation ")
    parser.add_argument("--zip",
                        action='store_true',
                        help="compression zip à la fin")
    parser.add_argument(
        "--partial",
        type=int,
        default=-1,
        help=
        'créer les fichiers par réunion de plusieurs fichiers avec une granularité de documents choisie. Si -2, alors granularité d\'un journal. Valeur conseillée : 2000.'
    )
    parser.add_argument("--stemmer",
                        action='store_true',
                        help='activer stemmer')
    parser.add_argument("--randomindexing",
                        action='store_true',
                        help='activer random indexing')
    args = parser.parse_args()

    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"

    workspace_path = args.o
    if not args.o.endswith("/"):
        workspace_path += "/"

    pathlist = Path(latimes_path).glob('**/la*')

    vocabulary = dict()
    filemanager = fm.FileManager(args.f, workspace_path)
    random_indexing = None
    if args.randomindexing:
        random_indexing = ri.RandomIndexing()

    if args.stemmer:
        analysis.setPreprocessor(preprocessing.Preprocessor(True))

    if args.partial == -2:
        print("Partial analysis in progress")
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = dict()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
    if args.partial != -1:
        nbDocsInMemory = 0
        stepFlush = args.partial

        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):

            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                if rand_indexing_counter < MAX_RANDOM_INDEXING:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, random_indexing, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                else:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory >= stepFlush:
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                rand_indexing_counter += 1

        if nbDocsInMemory != 0:
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)

        print("Merging in progress…")

        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
        print("Inverted file created !")

    else:
        print("Non partial")
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            if rand_indexing_counter < MAX_RANDOM_INDEXING:
                rand_indexing_counter += 1
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           random_indexing, False)
            else:
                analysis.analyse_newspaper(newspaper_path, vocabulary, None,
                                           False)
        analysis.computeIDF(vocabulary)
        filemanager.save_vocabularyAndPL_file(vocabulary)

        print("Inverted file created !")

    if args.zip:
        print("Compressing…")
        filemanager = fm.FileManager(args.f, args.o)
        zip.compressPLVBYTEFromSavedVocAndPL(filemanager)
        zip.compressZip(filemanager.getPathPLCompressed())
        zip.compressZip(filemanager.getPathVocCompressed())
        zip.compressZip(filemanager.getPathPLScore())
        print("Compressed!")

    if args.randomindexing:
        filemanager.save_random_indexing(random_indexing.getTermsVectors(),
                                         random_indexing.getTermDimension())
        print("Random indexing created")
Example #4
    parser.add_argument("--stemmer",
                        action='store_true',
                        help='enable the stemmer')
    parser.add_argument("-n",
                        type=str,
                        required=True,
                        help='nombre de synonymes pour la requête')

    args = parser.parse_args()

    workspace_path = args.d
    if not args.d.endswith("/"):
        workspace_path += "/"

    random_indexing = ri.RandomIndexing()
    filemanager = fm.FileManager(args.f, workspace_path)

    ri_term, ri_voc = filemanager.read_random_indexing(
        random_indexing.getTermDimension())
    if args.stemmer:
        preprocessor = pp.Preprocessor(True)
    else:
        preprocessor = pp.Preprocessor(False)

    stemmed = preprocessor.process(args.t)
    try:
        indexToSearch = ri_term.index(stemmed[0])
        print("Synonymes for : {} ".format(ri_term[indexToSearch]))
        res = classify(ri_voc[indexToSearch], ri_voc, int(args.n))
        for i, term_index in enumerate(res):
            print("{:<3} : {}".format(i, ri_term[term_index]))
    except ValueError as e:
        print(args.t + ' is not in the indexed list')
Example #5
def analysis_parameters():
    parser = argparse.ArgumentParser()

    parser.add_argument("-d", type=str, default='./workspace/',
                        help="dossier avec les fichier VOC et PL résultat de l'indexation")
    parser.add_argument("-f", type=str,
                        help="nom de fichier VOC et PL ", required=True)
    parser.add_argument("-q", type=str,
                        help="requête des termes separés par un virgule. Ex: voiture,maison ", required=True)
    parser.add_argument("-n", type=int, default=3,
                        help="nombre de résultats souhaité de documents ")
    parser.add_argument("--stemmer", action='store_true',
                        help="activer le stemming sur les termes de la requête")
    parser.add_argument("--algo", type=str, default="naive",
                        help="algorithme souhaité pour la requête ")
    parser.add_argument("--view", type=str, default="simple",
                        help="type de visualisation. Options possible: simple ou fullText ")
    parser.add_argument("--vpath", type=str, default="./data/latimes/",
                        help="path des fichier sources pour --view fullText")
    parser.add_argument("--improvedquery", action='store_true',
                        help="activer recherche de synonymes pour l'amélioration de la requête")

    args = parser.parse_args()
    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"
    filemanager = fm.FileManager(args.f, latimes_path)
    savedVoc = filemanager.read_vocabulary()
    if args.stemmer:
        print("Stemmer activated")
        preprocessor = preprocessing.Preprocessor(True)
    else:
        preprocessor = preprocessing.Preprocessor(False)
    epsilon = 0

    switchAlgo = {"naive": naivetopk.apply_naive_top_k_algo,
                  "fagins": faginstopk.apply_top_k_algo,
                  "faginsTA": faginsta.apply_fagins_ta}

    algoFunct = switchAlgo[args.algo]

    words = preprocessor.process(args.q)
    words_request = []
    if args.improvedquery:
        random_indexing = ri.RandomIndexing()
        for word in words:
            words_request.append(word)

            try:
                synonymes = synknn.get_synonyms(
                    word, 2, random_indexing.getTermDimension(), filemanager)
                if len(synonymes) == 2:
                    words_request.append(synonymes[1])
            except Exception as e:
                print(e)
        print("Improved query: {}".format(words_request))
    else:
        words_request = words

    if (not filemanager.doesUnCompressedVersionExists()) and filemanager.doesCompressedVersionExists():
        print("Unzipping in progress…")
        compressor.decompressZip(filemanager.getPathPLCompressed(), filemanager.getPathPLCompressed())
        compressor.decompressZip(filemanager.getPathVocCompressed(), filemanager.getPathVocCompressed())
        compressor.decompressZip(filemanager.getPathPLScore(), filemanager.getPathPLScore())
        compressor.decompressPLVBYTE(filemanager)

    result = algoFunct(words_request, savedVoc, filemanager, epsilon, args.n)

    switchView = {"simple": view.displayResults,
                  "fullText": view.displayResultsText}
    viewFunct = switchView[args.view]
    print("\nResults: ")
    viewFunct(result, args.vpath)
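Similarly, a hedged example of driving this query entry point; the index name, query and options are illustrative:

# Hypothetical launch (the file name query_cli.py is an assumption):
#   python query_cli.py -f latimes_index -q voiture,maison -n 5 \
#       --stemmer --algo fagins --view fullText
if __name__ == "__main__":
    analysis_parameters()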
Example #6
import argparse

import wordtraveller.compressor as compressor
import wordtraveller.faginstavf as faginsta
import wordtraveller.faginstopkvf as faginstopk
import wordtraveller.filemanager as fm
import wordtraveller.naivetopk as naivetopk
import wordtraveller.preprocessing as preprocessing
import wordtraveller.randomIndexing as ri
import wordtraveller.randomIndexingFindSynonym as synknn
import wordtraveller.view as view

preprocessor = preprocessing.Preprocessor(True)


def analysis_parameters():
    parser = argparse.ArgumentParser()

    parser.add_argument("-d", type=str, default='./workspace/',
                        help="dossier avec les fichier VOC et PL résultat de l'indexation")
    parser.add_argument("-f", type=str,
                        help="nom de fichier VOC et PL ", required=True)
    parser.add_argument("-q", type=str,
                        help="requête des termes separés par un virgule. Ex: voiture,maison ", required=True)
    parser.add_argument("-n", type=int, default=3,
                        help="nombre de résultats souhaité de documents ")
    parser.add_argument("--stemmer", action='store_true',
                        help="activer le stemming sur les termes de la requête")
    parser.add_argument("--algo", type=str, default="naive",
                        help="algorithme souhaité pour la requête ")
    parser.add_argument("--view", type=str, default="simple",
Example #7
import math
import re
import time
import numpy
from pathlib import Path
from threading import Thread

import nltk
from lxml import etree
from sortedcontainers import SortedDict

import wordtraveller.filemanager as fm
import wordtraveller.preprocessing as preprocessing

preprocessor = preprocessing.Preprocessor()


class AnalyseThread(Thread):
    def __init__(self,
                 function,
                 chunkpath,
                 voc,
                 randIndexing=None,
                 computeIDF=False,
                 nbDocToStart=0,
                 nbDocToScan=-1):
        Thread.__init__(self)
        self.function = function
        self.chunkpath = chunkpath
        self.voc = voc
        self.randIndexing = randIndexing
        self.computeIDF = computeIDF
        self.nbDocToStart = nbDocToStart
        self.nbDocToScan = nbDocToScan
Example #8
def analyseAndMergeDocuments(array_of_iterations, stepFlush):
    path = ""
    print("analyse_newspaper")
    print("Merging involved, flush frequency : Every " + str(stepFlush) +
          " document.")
    pathlist = Path("./../data/latimes/").glob('**/la*')

    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToMerge = []
    timeToFlush = [0] * len(array_of_iterations)
    timeTotal = []
    timeToAnalyse = []
    for numBatch, nbDocsToRead in enumerate(array_of_iterations):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbDocsToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbDocsRed = 0
        nbDocsInMemory = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):

            if nbDocsRed >= nbDocsToRead:
                break
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                docsRedInDocIteration = analysis.analyse_newspaper(
                    newspaper_path, vocabulary, None, False,
                    nbDocsRedInThisJournal, nbDocsRedInThisJournal + stepFlush)
                nbDocsRed = docsRedInDocIteration + nbDocsRed
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory == stepFlush or nbDocsRed >= nbDocsToRead:
                    startFlush = time.time()
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                    timeToFlush[numBatch] += (time.time() - startFlush)
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed >= nbDocsToRead:
                break
        if nbDocsRed < nbDocsToRead:
            print("Benchmark invalid, as we ran out of documents to read.")
        timeToExtract.append(time.time() - start)
        start = time.time()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        timeToMerge.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)

    analysis.setPreprocessor(tmpPreprocessor)
    print(array_of_iterations)
    print("Ttmerge")
    print(timeToMerge)
    plt.plot(array_of_iterations, timeToMerge, label="Time to merge")
    print("Ttextract")
    print(timeToExtract)
    plt.plot(array_of_iterations,
             timeToExtract,
             label="Time to analyse document (with flushing)")
    print("Ttflush")
    print(timeToFlush)
    plt.plot(array_of_iterations, timeToFlush, label="Time to flush documents")
    print("Overalltime")
    print(timeTotal)
    plt.plot(array_of_iterations, timeTotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
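A hedged sketch of calling this merging benchmark, assuming the module-level imports it relies on (analysis, fm, preprocessing, os, time, matplotlib.pyplot as plt) are in place as in Examples #6 and #7; the batch sizes and flush step are illustrative:

# Hypothetical run: benchmark merging for growing batches of documents,
# flushing a partial VOC/PL file every 200 documents.
analyseAndMergeDocuments([500, 1000, 2000], stepFlush=200)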
Example #9
def analyseAndSaveDocuments(array_of_iterations,
                            computeIDF=False,
                            numberIterations=1):
    totaltimeToExtract = []
    totaltimeToSave = []
    totaltimeTotal = []
    totaltimeToComputeIDF = []
    for i in range(0, numberIterations):
        path = ""
        print("analyse_newspaper")
        print("Save only in the end, no merging involved")
        pathlist = Path("./../data/latimes/").glob('**/la*')

        tmpPreprocessor = analysis.preprocessor
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activate_stemmer=False))
        timeToExtract = []
        timeToSave = []
        timeTotal = []
        timeToComputeIDF = []
        for numBatch, nbDocsToRead in enumerate(array_of_iterations):
            startBatch = time.time()
            folder = './workspace/'
            for the_file in os.listdir(folder):
                file_path = os.path.join(folder, the_file)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except Exception as e:
                    print(e)
            filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                         str(nbDocsToRead))
            start = time.time()
            pathlist = Path("./../data/latimes/").glob('**/la*')
            vocabulary = dict()
            nbDocsRed = 0
            print("analysis in progress")
            for i, newspaper_path in enumerate(pathlist):

                if nbDocsRed >= nbDocsToRead:
                    break
                docsRedInDocIteration = -1

                while (docsRedInDocIteration != 0):
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False, 0,
                        nbDocsToRead - nbDocsRed)
                    nbDocsRed = docsRedInDocIteration + nbDocsRed

                    if nbDocsRed >= nbDocsToRead:
                        break
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed < nbDocsToRead:
                print("Benchmark invalid, as we ran out of documents to read.")
            timeToExtract.append(time.time() - start)
            if computeIDF:
                startComputeIDF = time.time()
                analysis.computeIDF(vocabulary)
                timeToComputeIDF.append(time.time() - startComputeIDF)
            start = time.time()
            print("Saving in progress…")
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
            timeToSave.append(time.time() - start)
            timeTotal.append(time.time() - startBatch)

        analysis.setPreprocessor(tmpPreprocessor)
        print("Number of documents :")
        print(array_of_iterations)
        plt.plot(array_of_iterations,
                 timeToExtract,
                 label="Time to analyse documents")
        print("Time to extract :")
        print(timeToExtract)

        if computeIDF:
            plt.plot(array_of_iterations,
                     timeToComputeIDF,
                     label="Time to compute IDF")

            print("Time to compute IDF :")
            print(timeToComputeIDF)

        plt.plot(array_of_iterations, timeToSave, label="Time to save")
        print("Time to save :")
        print(timeToSave)
        plt.plot(array_of_iterations, timeTotal, label="Overall time")
        print("Overall Time :")
        print(timeTotal)
        plt.xlabel("Number of Documents")
        plt.ylabel("Time (s)")
        plt.legend()
        plt.show()
        totaltimeToExtract.append(timeToExtract)
        if computeIDF:
            totaltimeToComputeIDF.append(timeToComputeIDF)
        totaltimeToSave.append(timeToSave)
        totaltimeTotal.append(timeTotal)

    if computeIDF:
        print("computeidf")
        resIDF = [0] * len(totaltimeToComputeIDF[0])
        for arr in totaltimeToComputeIDF:
            for i, elt in enumerate(arr):
                resIDF[i] = resIDF[i] + elt / len(totaltimeToComputeIDF)

        print(totaltimeToComputeIDF)
        print(resIDF)
    print("extract")
    resextract = [0] * len(totaltimeToExtract[0])
    for arr in totaltimeToExtract:
        for i, elt in enumerate(arr):
            resextract[i] = resextract[i] + elt / len(totaltimeToExtract)
    print(totaltimeToExtract)
    print(resextract)
    print("save")
    ressave = [0] * len(totaltimeToSave[0])
    for arr in totaltimeToSave:
        for i, elt in enumerate(arr):
            ressave[i] = ressave[i] + elt / len(totaltimeToSave)
    print(totaltimeToSave)
    print(ressave)
    print("total")
    restotal = [0] * len(totaltimeTotal[0])
    for arr in totaltimeTotal:
        for i, elt in enumerate(arr):
            restotal[i] = restotal[i] + elt / len(totaltimeTotal)
    print(totaltimeTotal)
    print(restotal)
    plt.plot(array_of_iterations,
             resextract,
             label="Time to analyse documents")
    if computeIDF:
        plt.plot(array_of_iterations, resIDF, label="Time to compute IDF")
    plt.plot(array_of_iterations, ressave, label="Time to save")
    plt.plot(array_of_iterations, restotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
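As above, a hedged invocation of the save-only benchmark; the document counts and number of repetitions are illustrative:

# Hypothetical run: repeat the save-only benchmark three times and average,
# including the IDF computation step.
analyseAndSaveDocuments([500, 1000, 2000], computeIDF=True, numberIterations=3)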
Example #10
def analyseAndSaveDocumentsMultithread(array_of_newspapers, computeIDF=False):
    path = ""
    print("analyse_newspaper")
    print("Save only in the end, no merging involved")

    pathlist = Path("./../data/latimes/").glob('**/la*')
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToSave = []
    timeTotal = []
    timeToAnalyse = []
    timeToComputeIDF = []
    for numBatch, nbNewsPaperToRead in enumerate(array_of_newspapers):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbNewsPaperToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbNewspaperRed = 0
        nbDocsRed = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):

            if nbNewspaperRed >= nbNewsPaperToRead:
                break
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            nbDocsRed = docsRedInDocIteration + nbDocsRed
            nbNewspaperRed += 1

        if nbNewspaperRed < nbNewsPaperToRead:
            print("Benchmark invalid, as we ran out of newspaper to read.")
        timeToExtract.append(time.time() - start)
        print("We red documents : ")
        print(nbDocsRed)
        if computeIDF:
            startComputeIDF = time.time()
            analysis.computeIDF(vocabulary)
            timeToComputeIDF.append(time.time() - startComputeIDF)
        start = time.time()
        print("Saving in progress…")
        filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
        timeToSave.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)

    analysis.setPreprocessor(tmpPreprocessor)
    print("Number of documents :")
    print(array_of_newspapers)
    plt.plot(array_of_newspapers,
             timeToExtract,
             label="Time to analyse documents")
    print("Time to extract :")
    print(timeToExtract)

    if computeIDF:
        plt.plot(array_of_newspapers,
                 timeToComputeIDF,
                 label="Time to compute IDF")

        print("Time to compute IDF :")
        print(timeToComputeIDF)

    plt.plot(array_of_newspapers, timeToSave, label="Time to save")
    print("Time to save :")
    print(timeToSave)
    plt.plot(array_of_newspapers, timeTotal, label="Overall time")
    print("Overall Time :")
    print(timeTotal)
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
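And a hedged invocation of the per-newspaper variant; the newspaper counts are illustrative:

# Hypothetical run: benchmark whole newspapers rather than document batches.
analyseAndSaveDocumentsMultithread([1, 2, 5, 10], computeIDF=True)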