Example No. 1
    def index(self):
        for doi, title, abstract in self.col:
            if self.tokenizerType == '0':  # simple
                tokenizer = Tokenizer.SimpleTokenizer(title, abstract)
            else:  # better
                tokenizer = Tokenizer.BetterTokenizer(title, abstract)

            terms = tokenizer.getTerms()
            for term in terms:
                if term in self.term_map:
                    if doi in self.term_map[term]:
                        self.term_map[term][doi] += 1
                    else:
                        self.term_map[term][doi] = 1
                else:
                    self.term_map[term] = {doi: 1}  # key: docId, value: term_freq
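The structure this method builds is a nested map {term: {docId: term_freq}}. The same frequency counting can be written more compactly with collections.Counter and defaultdict; the sketch below is illustrative only (build_term_map, docs and the tokenize argument are hypothetical names, not part of the original class):

from collections import Counter, defaultdict

def build_term_map(docs, tokenize):
    """Build an inverted index {term: {doc_id: term_freq}} from (doc_id, text) pairs."""
    term_map = defaultdict(dict)
    for doc_id, text in docs:
        for term, freq in Counter(tokenize(text)).items():
            term_map[term][doc_id] = freq
    return dict(term_map)

# usage with made-up documents and a trivial whitespace tokenizer:
docs = [("doi-1", "viral protein expression"), ("doi-2", "gene expression data")]
print(build_term_map(docs, str.split)["expression"])  # {'doi-1': 1, 'doi-2': 1}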
Example No. 2
    def index(self):

        self.N = 0

        if self.tokenizerType == '0':  # simple
            tokenizer = Tokenizer.SimpleTokenizer('')
        else:  # better
            tokenizer = Tokenizer.BetterTokenizer('')

        for doi, title, abstract in CorpusReader.CorpusReader(
                self.collectionPath).readCorpus():
            self.N += 1
            tokenizer.changeText(title + " " + abstract)
            terms = tokenizer.getTerms()

            # first, we populate the dictionary postingsMaps with the term frequency {term: {docId: term_freq} }
            for term in terms:
                if term in self.postingsMaps:
                    if doi in self.postingsMaps[term]:
                        self.postingsMaps[term][doi] += 1
                    else:
                        self.postingsMaps[term][doi] = 1
                else:
                    self.postingsMaps[term] = {doi: 1}  # key: docId, value: term_freq

        # lnc (logarithmic term frequency, no document frequency, cosine normalization)
        # then, we modify the postingsMaps from {term: {docId: term_freq}} to {term: idf, {docId: weight}}
        # logarithmic term frequency
        self.postingsMaps = {
            term: (getIDFt(term, self.postingsMaps, self.N), {
                docId: getLogWeight(term, docId, self.postingsMaps)
                for docId in self.postingsMaps[term].keys()
            })
            for term in self.postingsMaps.keys()
        }
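getIDFt and getLogWeight are not shown in this excerpt. Under the lnc convention described in the comments (logarithmic term frequency, idf kept alongside, presumably for the query side, cosine normalization applied elsewhere), they would plausibly look like the sketch below; the log base and the exact signatures are assumptions, not taken from the original module:

import math

def getIDFt(term, postingsMaps, N):
    # inverse document frequency: idf_t = log10(N / df_t),
    # where df_t is the number of documents whose postings contain the term
    return math.log10(N / len(postingsMaps[term]))

def getLogWeight(term, docId, postingsMaps):
    # logarithmic term frequency: 1 + log10(tf), or 0 when the term is absent
    tf = postingsMaps[term].get(docId, 0)
    return 1 + math.log10(tf) if tf > 0 else 0

Cosine normalization of these per-document weights (dividing each weight by the norm of the document's weight vector) is not visible in this excerpt and would have to happen in a later step.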
Example No. 3
def main(argv):
    """
    Main script for the discipline's assignment 3. This script is responsible for calling the correct classes and for creating the data flow necessary for querying an existing index.

    :param argv: receives the arguments passed to the program during execution
    :type argv: list<str>

    """

    HELP = """USAGE:\n
    python3 QueryIndex.py [-h] [-o outputFile] [-t tokenizer] [-r limitRAM] [-f feedback] [-s rocchioScope] [-c numChamps] [-l limit] <queryFile> <indexFolder> [a b g] \n
        OPTIONS:
           h - shows this help
           o - define output file's name
           t - define the tokenizer used for the program
           r - limit program execution to defined RAM capacity
           f - define the feedback used for the Rocchio algorithm
           s - define the number of retrieved documents considered for the Rocchio algorithm
           c - define the size of the champions list
           l - define the number of scores to return
        ARGUMENTS:
           outputFile - actual name for the output file
           tokenizer - must be 'simple' or 'complex'
           limitRAM - maximum RAM (in GB) used in the indexing process
           queryFile - name of the file containing 1 or more queries
           indexFolder - name of the folder that contains the indexes
           a - alpha weight for the Rocchio algorithm
           b - beta weight for the Rocchio algorithm
           g - gamma weight for the Rocchio algorithm
           feedback - must be 'user' or 'pseudo'
           rocchioScope - number of retrieved documents considered for the Rocchio algorithm
           numChamps - size of the champions list
           limit - limit number of scores to return"""

    # default variables
    outputFile = "../queryResults/"
    tokenizer = "complex"
    maximumRAM = None
    feedback = None  # None, pseudo or user
    rocchioWeights = []  # alpha, beta and gamma
    n = None  # number of relevant docs (for feedback)
    k = 10000  # champions list size
    limit = 100  # number of scores

    try:
        opts, args = getopt.getopt(argv, "ho:t:r:f:c:s:l:")
    except getopt.GetoptError:
        print(HELP)
        return 1

    if len(args) not in (2, 4, 5):
        print(HELP)
        return 2

    # verifies if any option was passed to the script
    for opt, arg in opts:
        if opt == '-h':
            print(HELP)
            return 3
        elif opt == "-o":
            outputFile = arg
        elif opt == "-t":
            assert arg in (
                "simple", "complex"
            ), "Tokenizer option must be either \"simple\" or \"complex\"."
            tokenizer = arg
        elif opt == "-r":
            maxM = psutil.virtual_memory().free
            if arg != "":
                maximumRAM = float(arg) * 1000000000
            else:
                maximumRAM = maxM
            if maximumRAM > maxM:
                maximumRAM = maxM
                print(
                    "Warning: available memory is less than the requested value, maximumRAM set to "
                    + str(int(maximumRAM / 1000000000)) + "GB.")
        elif opt == "-f":
            assert arg in (
                "user", "pseudo"
            ), "Feedback option must be either \"user\" or \"pseudo\"."
            feedback = arg
        elif opt == "-c":
            assert int(
                arg) > 0, "Error: numChamps value must be a positive integer"
            k = int(arg)
        elif opt == "-s":
            assert int(
                arg
            ) > 0, "Error: rocchioScope value must be a positive integer"
            n = int(arg)
        elif opt == "-l":
            assert int(
                arg) > 0, "Error: limit value must be a positive integer"
            limit = int(arg)

    if feedback:
        if feedback == "pseudo":
            assert len(
                args
            ) == 4, "Error: if you want to use pseudo feedback, please insert alpha and beta as well"
            rocchioWeights.append(float(args[2]))
            rocchioWeights.append(float(args[3]))
            # rocchioWeights.append(float(args[4]))
        else:
            assert len(
                args
            ) == 5, "Error: if you want to use user feedback, please insert alpha, beta and gamma as well"
            rocchioWeights.append(float(args[2]))
            rocchioWeights.append(float(args[3]))
            rocchioWeights.append(float(args[4]))

    # taking into account the chosen tokenizer, the respective data flow is created
    if tokenizer == "simple":
        assignment3(outputFile, Tokenizer.SimpleTokenizer(), maximumRAM,
                    feedback, n, k, limit, args[0], args[1], rocchioWeights)
    else:  # 'complex' = default tokenizer
        assignment3(outputFile, Tokenizer.ComplexTokenizer(), maximumRAM,
                    feedback, n, k, limit, args[0], args[1], rocchioWeights)

    return 0
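For reference, an invocation consistent with the HELP text and the argument checks above might look like this (the query file and index folder names are placeholders):

# equivalent to:
#   python3 QueryIndex.py -t complex -f pseudo -s 10 -c 5000 -l 50 queries.txt ../index 1.0 0.5
# pseudo feedback expects exactly 4 positional arguments: queryFile, indexFolder, alpha, beta
main(["-t", "complex", "-f", "pseudo", "-s", "10", "-c", "5000", "-l", "50",
      "queries.txt", "../index", "1.0", "0.5"])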
Example No. 4
def main(argv):
    """
    Main script for the discipline's assignments 1 and 2. This script is responsible for calling the correct classes and for creating the data flow necessary for the index to be created and persisted.

    :param argv: receives the arguments passed to the program during execution
    :type argv: list<str>

    """

    HELP = """USAGE:\n
    python3 CreateIndex.py [-h] [-p] [-w] [-o outputFolder] [-l limit] [-t tokenizer] [-r limitRAM] inputFolder\n
        OPTIONS:
           h - shows this help
           o - define output file's folder
           l - define limit for the number of lines to be processed in each input file
           t - define the tokenizer used for the program
           r - limit program execution to defined RAM capacity
           w - process weights of terms
           p - process positions of terms
        ARGUMENTS:
           outputFolder - actual name for the output folder
           limit - value for the number of lines limit
           tokenizer - must be 'simple' (the simpler 2.1 tokenizer) or 'complex' (the more advanced 2.2 tokenizer)
           limitRAM - maximum RAM (in GB) used in the indexing process
           inputFolder - name of the folder that contains the input files to be processed"""

    # default variables
    outputFolder = "index"
    limit = None
    tokenizer = "simple"
    maximumRAM = None
    weightCalc = False
    positionCalc = False
    fileLimit = float("inf")

    try:
        opts, args = getopt.getopt(argv, "wpho:t:l:r:f:")
    except getopt.GetoptError:
        print(HELP)
        return 1

    if len(args) != 1:
        print(HELP)
        return 2

    # verifies if any option was passed to the script
    for opt, arg in opts:
        if opt == '-h':
            print(HELP)
            return 3
        elif opt == "-o":
            outputFolder = arg
        elif opt == "-l":
            limit = int(arg)
        elif opt == "-f":
            fileLimit = float(arg) * 1000000000
        elif opt == "-t":
            assert arg in (
                "simple", "complex"
            ), "Tokenizer option must be either \"simple\" or \"complex\"."
            tokenizer = arg
        elif opt == "-w":
            weightCalc = True
        elif opt == "-p":
            positionCalc = True
        elif opt == "-r":
            maxM = psutil.virtual_memory().free
            if arg != "":
                maximumRAM = float(arg) * 1000000000
            else:
                maximumRAM = maxM
            if maximumRAM > maxM:
                maximumRAM = maxM
                print(
                    "Warning: available memory is less than the requested value, maximumRAM set to "
                    + str(int(maximumRAM / 1000000000)) + "GB.")

    # taking into account the chosen tokenizer, the respective data flow is created
    if tokenizer == "simple":
        if maximumRAM is None:
            assignment1(Tokenizer.SimpleTokenizer(), outputFolder, args[0],
                        limit, weightCalc, positionCalc, fileLimit)
        else:
            assignment2(Tokenizer.SimpleTokenizer(), outputFolder, args[0],
                        limit, weightCalc, positionCalc, maximumRAM, fileLimit)

    else:  # 'complex' = default tokenizer
        if maximumRAM is None:
            assignment1(Tokenizer.ComplexTokenizer(), outputFolder, args[0],
                        limit, weightCalc, positionCalc, fileLimit)
        else:
            assignment2(Tokenizer.ComplexTokenizer(), outputFolder, args[0],
                        limit, weightCalc, positionCalc, maximumRAM, fileLimit)

    return 0
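Two invocations consistent with the option handling above; without -r the in-memory assignment1 flow runs, and with -r the RAM-limited assignment2 flow runs (folder names are placeholders):

# in-memory indexing (assignment1) with the simple tokenizer:
#   python3 CreateIndex.py -t simple -o ../index ../corpus
main(["-t", "simple", "-o", "../index", "../corpus"])

# RAM-limited indexing (assignment2) with the complex tokenizer, weights, positions and a 4 GB cap:
#   python3 CreateIndex.py -w -p -t complex -r 4 -o ../index ../corpus
main(["-w", "-p", "-t", "complex", "-r", "4", "-o", "../index", "../corpus"])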
Example No. 5
    def index(self):

        start_indexing = timeit.default_timer()
        self.N = 0

        if self.tokenizerType == '0':  # simple
            tokenizer = Tokenizer.SimpleTokenizer('')
        else:  # better
            tokenizer = Tokenizer.BetterTokenizer('')

        corpusReader = CorpusReader.CorpusReader(self.collectionPath)

        print('start memory available: {}%'.format(
            psutil.virtual_memory().available * 100 /
            psutil.virtual_memory().total))

        corpusReader.startReadingCorpus()
        nDicts = 0

        # ------------------------------------------ INDEX WITH TERM POSITIONS -----------------------------------------
        if self.withPositions:

            # ---------------------------------------------- INDEX BLOCKS ----------------------------------------------
            while True:

                # -------------------------------------------- Get Document --------------------------------------------
                doc = corpusReader.readDoc()

                # last document
                if doc == -1:
                    if self.postingsMaps != {}:
                        # start = timeit.default_timer()
                        self.writeIndexToBlockFileWithPositions(
                            './dicts/dict' + str(nDicts))
                        # stop = timeit.default_timer()
                        # print('write: {} seconds'.format(stop - start))
                        # print('memory used: {} %'.format(psutil.Process(os.getpid()).memory_percent() * 100))
                        print('available memory: {} %'.format(
                            psutil.virtual_memory().available * 100 /
                            psutil.virtual_memory().total))

                        nDicts += 1
                        self.postingsMaps = {}  # clean dictionary
                    break
                elif doc is None:
                    continue

                (doi, title, abstract) = doc
                del doc
                self.N += 1
                #startdocreadtime = timeit.default_timer()

                # ------------------------------------------- Get Document Terms ---------------------------------------
                tokenizer.changeText(title + " " + abstract)
                del title
                del abstract
                terms, termPositions = tokenizer.getTerms(withPositions=True)
                tokenizer.changeText("")  # clean term memory from tokenizer

                # first, we populate the dictionary postingsMaps with the term positions {term: {docId: [termpositions]} }

                if (psutil.virtual_memory().available * 100 /
                        psutil.virtual_memory().total
                    ) <= 10 and self.postingsMaps != {}:  # available memory
                    #start = timeit.default_timer()
                    self.writeIndexToBlockFileWithPositions('./dicts/dict' +
                                                            str(nDicts))
                    #stop = timeit.default_timer()
                    #print('write: {} seconds'.format(stop - start))
                    #print('memory used: {} %'.format(psutil.Process(os.getpid()).memory_percent() * 100))
                    print('available memory: {} %'.format(
                        psutil.virtual_memory().available * 100 /
                        psutil.virtual_memory().total))

                    nDicts += 1
                    self.postingsMaps = {}  # clean dictionary
                else:
                    # a doi is never already present for a term at this point, so each
                    # assignment simply adds this document's position list
                    for termInd in range(len(terms)):
                        if terms[termInd] in self.postingsMaps:
                            self.postingsMaps[terms[termInd]][doi] = termPositions[termInd]
                        else:
                            self.postingsMaps[terms[termInd]] = {doi: termPositions[termInd]}  # key: docId, value: [pos1, pos2, ...]

                    del terms
                    del termPositions

                #enddocreadtime = timeit.default_timer()
                #print('document {}: {} seconds'.format(doi, enddocreadtime - startdocreadtime))

            # ---------------------------------------- ENDED INDEXING BLOCKS -------------------------------------------
            stop_indexing = timeit.default_timer()
            print('indexing into blocks: {} minutes and {} seconds'.format(
                (stop_indexing - start_indexing) // 60,
                (stop_indexing - start_indexing) % 60))

            start = timeit.default_timer()
            if os.path.isfile("index"):
                os.remove("index")

            final_dict = open("index", "w")
            dict_names = [
                './dicts/dict' + str(nDict) for nDict in range(nDicts)
            ]

            # -------------------------------------------- MERGE INDEX BLOCKS ------------------------------------------
            print('merging dictionary phase and writing index to disk')

            temp_dicts = [open(dict_name, "r") for dict_name in dict_names]
            ntermsToDisk = 0
            while temp_dicts != []:
                for dict_file in list(temp_dicts):  # iterate over a copy, since exhausted blocks are removed from temp_dicts
                    # ---------------------- Read first line of each file ------------------------------------------
                    line = dict_file.readline()

                    if not line:
                        #print('file: {}, temp_dicts: {}'.format(dict_file, temp_dicts))
                        dict_file.close()
                        # delete dictionary block from disk
                        os.remove(dict_names[temp_dicts.index(dict_file)])
                        dict_names.remove(
                            dict_names[temp_dicts.index(dict_file)])
                        temp_dicts.remove(dict_file)
                        continue

                    # ------------------------ Save line info to memory --------------------------------------------
                    info = line.split(
                        '|'
                    )  # 'term', 'docid', 'pos1,pos2,pos3', 'docid', 'pos1,pos2,pos3', ...
                    info.remove('\n')
                    while '' in info:
                        info.remove('')
                    term = info[0]  # term
                    #print('term: {}'.format(term))
                    docIds = info[1:][0::2]  # [docid, docid, ...]
                    #print('docIds: {}'.format(docIds))
                    termPositions = [
                        positions.split(',') for positions in info[1:][1::2]
                    ]  # [[pos1,pos2,pos3], [pos1,pos2,pos3], ...]
                    #print('termPositions: {}'.format(termPositions))
                    #print('postingsMaps: {}'.format(list(self.postingsMaps.items())))
                    if term in self.postingsMaps.keys():
                        #for docId in docIds:
                        # if docId in line_temp_dict[term].keys(): -> doesn't happen, because a block is only written after a whole document has been read
                        # merge postings lists (update in order)
                        self.postingsMaps[term].update({
                            docIds[docInd]: termPositions[docInd]
                            for docInd in range(len(docIds))
                        })
                    else:
                        self.postingsMaps.update({
                            term: {
                                docIds[docInd]: termPositions[docInd]
                                for docInd in range(len(docIds))
                            }
                        })

                if self.postingsMaps != {}:
                    # ------------------------- CALCULATE WEIGHTS AND WRITE ON FINAL INDEX -----------------------------
                    # todo: verify that all these functions (storecalculations) work with this new self.postingsMaps dictionary structure
                    # get first element of alphabetical sorted list of terms in memory
                    minorTerm = sorted(self.postingsMaps.keys())[0]
                    #print('[\'-Complex@ZIF-67:qgdvdy3k:1|gltf4m6w:1|\n:5.422985219043376|\n\']')
                    #print('term: ' + minorTerm)
                    #print('idf: ' + str(getIDFt(minorTerm, self.postingsMaps, self.N)))
                    #print('doc_ids: ' + ''.join([str(doc_id) for doc_id, positions in self.postingsMaps[minorTerm].items()]))
                    #print('LogWeightPositions: ' + ''.join([str(getLogWeightPositions(minorTerm, doc_id, self.postingsMaps)) for doc_id, positions in self.postingsMaps[minorTerm].items()]))
                    #print('positions: ' + ','.join([','.join([str(pos) for pos in positions]) for doc_id, positions in self.postingsMaps[minorTerm].items()]))

                    # write its information to the final dictionary
                    final_dict.writelines([
                        minorTerm + ':' +  # term:
                        str(getIDFt(minorTerm, self.postingsMaps, self.N)) +
                        '|' +  # idf|
                        '|'.join([
                            str(doc_id) + ':' +  # doc_id:
                            str(
                                getLogWeightPositions(minorTerm, doc_id,
                                                      self.postingsMaps)) +
                            ':' +  # term_weight:
                            ','.join([str(pos)
                                      for pos in positions])  # pos1,pos2,...
                            for doc_id, positions in
                            self.postingsMaps[minorTerm].items()
                        ]) + '\n'
                    ])

                    ntermsToDisk += 1
                    #print('merging dictionary phase: wrote {} terms to disk'.format(ntermsToDisk))

                    # remove it from memory
                    del self.postingsMaps[minorTerm]

            # ---------------------------------------- ENDED MERGING INDEX BLOCKS --------------------------------------
            del info
            del term
            del docIds
            del termPositions
            del minorTerm

            final_dict.close()

            stop = timeit.default_timer()
            print(
                'merge and write of final dictionary: {} minutes and {} seconds'
                .format((stop - start) // 60, (stop - start) % 60))

        # ----------------------------------------- INDEX WITHOUT TERM POSITIONS ---------------------------------------
        else:
            # ---------------------------------------------- INDEX BLOCKS ----------------------------------------------
            while True:
                doc = corpusReader.readDoc()
                # last document
                if doc == -1:
                    if self.postingsMaps != {}:
                        # start = timeit.default_timer()
                        self.writeIndexToBlockFile('./dicts/dict' +
                                                   str(nDicts))
                        # stop = timeit.default_timer()
                        # print('write: {} seconds'.format(stop - start))
                        # print('memory used: {} %'.format(psutil.Process(os.getpid()).memory_percent() * 100))
                        print('available memory: {} %'.format(
                            psutil.virtual_memory().available * 100 /
                            psutil.virtual_memory().total))

                        nDicts += 1
                        self.postingsMaps = {}  # clean dictionary
                    break
                elif doc is None:
                    continue

                (doi, title, abstract) = doc
                del doc
                self.N += 1

                # ------------------------------------------- Get Document Terms ---------------------------------------
                tokenizer.changeText(title + " " + abstract)
                del title
                del abstract
                terms = tokenizer.getTerms(withPositions=False)
                tokenizer.changeText("")  # clean term memory from tokenizer

                # first, we populate the dictionary postingsMaps with the term frequency {term: {docId: term_freq} }
                # (nDicts is not reset here; resetting it per document would overwrite earlier block files)
                for term in terms:
                    if (
                            psutil.virtual_memory().available * 100 /
                            psutil.virtual_memory().total
                    ) <= 10 and self.postingsMaps != {}:  # available memory
                        # available memory is low: flush the in-memory block {term: {docId: term_freq}} to disk;
                        # the lnc weights (logarithmic tf, idf, cosine normalization) are only computed later, in the merge phase
                        self.writeIndexToBlockFile('./dicts/dict' +
                                                   str(nDicts))
                        nDicts += 1
                        self.postingsMaps = {}  # clean dictionary

                    if term in self.postingsMaps:
                        if doi in self.postingsMaps[term]:
                            self.postingsMaps[term][doi] += 1
                        else:
                            self.postingsMaps[term][doi] = 1
                    else:
                        self.postingsMaps[term] = {doi: 1}  # key: docId, value: term_freq

            # ---------------------------------------- ENDED INDEXING BLOCKS -------------------------------------------
            start = timeit.default_timer()
            if os.path.isfile("index"):
                os.remove("index")
            final_dict = open("index", "w")
            dict_names = [
                './dicts/dict' + str(nDict) for nDict in range(nDicts)
            ]

            # -------------------------------------------- MERGE INDEX BLOCKS ------------------------------------------
            temp_dicts = [open(dict_name, "r") for dict_name in dict_names]
            while temp_dicts != []:
                for dict_file in list(temp_dicts):  # iterate over a copy, since exhausted blocks are removed from temp_dicts
                    # ---------------------- Read first line of each file ------------------------------------------
                    line = dict_file.readline()

                    if not line:
                        dict_file.close()
                        # delete dictionary block from disk
                        os.remove(dict_names[temp_dicts.index(dict_file)])
                        dict_names.remove(
                            dict_names[temp_dicts.index(dict_file)])
                        temp_dicts.remove(dict_file)
                        continue

                    # ------------------------ Save line info to memory --------------------------------------------
                    info = line.split(
                        '|'
                    )  # 'term', 'docid', 'term_freq', 'docid', 'term_freq', ...
                    info.remove('\n')
                    while '' in info:
                        info.remove('')
                    term = info[0]  # term
                    docIds = info[1:][0::2]  # [docid, docid, ...]
                    termFreqs = info[1:][1::2]  # [term_freq, term_freq, ...]

                    if term in self.postingsMaps.keys():
                        self.postingsMaps[term].update({
                            docIds[docInd]: termFreqs[docInd]
                            for docInd in range(len(docIds))
                        })
                    else:
                        self.postingsMaps.update({
                            term: {
                                docIds[docInd]: termFreqs[docInd]
                                for docInd in range(len(docIds))
                            }
                        })

                if self.postingsMaps != {}:

                    # ------------------------- CALCULATE WEIGHTS AND WRITE ON FINAL INDEX -----------------------------
                    # todo: verify that all these functions (storecalculations) work with this new self.postingsMaps dictionary structure
                    # get first element of alphabetical sorted list of terms in memory
                    minorTerm = sorted(self.postingsMaps.keys())[0]

                    # write its information to the final dictionary
                    final_dict.writelines([
                        minorTerm + ':' +  # term:
                        str(getIDFt(minorTerm, self.postingsMaps, self.N)) +
                        '|' +  # idf|
                        '|'.join([
                            str(doc_id) + ':' +  # doc_id:
                            str(
                                getLogWeight(minorTerm, doc_id, self.
                                             postingsMaps))  # term_weight|
                            for doc_id, _term_freq in
                            self.postingsMaps[minorTerm].items()
                        ]) + '\n'
                    ])

                    # remove it from memory
                    del self.postingsMaps[minorTerm]

            # ---------------------------------------- ENDED MERGING INDEX BLOCKS --------------------------------------
            del info
            del term
            del docIds
            del termFreqs
            del minorTerm

            final_dict.close()

            stop = timeit.default_timer()
            print(
                'merge and write of final dictionary: {} minutes and {} seconds'
                .format((stop - start) // 60, (stop - start) % 60))
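The merge phase above writes one line per term to the final index, in the form term:idf|doc_id:weight:pos1,pos2,...|... (or term:idf|doc_id:weight|... when positions are not stored). A minimal sketch of reading such a line back, assuming that format and that doc ids contain no ':' (parse_index_line is a made-up helper, not part of the original code):

def parse_index_line(line, with_positions=True):
    """Parse 'term:idf|doc_id:weight:pos1,pos2,...|...' into (term, idf, postings)."""
    head, *postings_raw = line.rstrip("\n").split("|")
    term, idf = head.rsplit(":", 1)  # rsplit in case the term itself contains ':'
    postings = {}
    for posting in postings_raw:
        if with_positions:
            doc_id, weight, positions = posting.split(":")
            postings[doc_id] = (float(weight), [int(p) for p in positions.split(",")])
        else:
            doc_id, weight = posting.split(":")
            postings[doc_id] = float(weight)
    return term, float(idf), postings

# usage on a line in the with-positions format (doc ids here are made-up examples):
term, idf, postings = parse_index_line("virus:1.3010|qgdvdy3k:1.3010:4,17|gltf4m6w:1.0:2\n")
print(term, idf, postings["qgdvdy3k"])  # virus 1.301 (1.301, [4, 17])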