Example No. 1
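Both examples assume module-level imports from the surrounding project (roughly: the standard gc module plus the project's FileParser, Indexer, PersistIndex and Merger modules) and, in the first example, an isMemoryAvailable helper; those import statements are not shown on this page.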
def assignment2(tokenizer, outputFolder, inputFolder, limit, weightCalc,
                positionCalc, maximumRAM, fileLimit):
    """
    Follows the execution flow specific for the second assignment.

    :param tokenizer: class instance to be used in the tokenization process
    :type tokenizer: Tokenizer
    :param outputFolder: name of the folder where the final index will be written to
    :type outputFolder: str
    :param inputFolder: single-element list with the name of the folder containing the files to be indexed
    :type inputFolder: list<str>
    :param limit: maximum number of documents to consider, or None for no limit
    :type limit: int
    :param weightCalc: True if the term weights are to be calculated, False if not
    :type weightCalc: bool
    :param positionCalc: True if the term positions are to be calculated, False if not
    :type positionCalc: bool
    :param maximumRAM: maximum amount of RAM (in GB) allowed for the program execution
    :type maximumRAM: int
    :param fileLimit: limit used by the index persister and merger when writing the index files
    :type fileLimit: int

    """

    parser = FileParser.LimitedRamFileParser(inputFolder, limit)

    indexer = Indexer.FileIndexer(tokenizer, positionCalc, weightCalc)
    if weightCalc and positionCalc:
        persister = PersistIndex.PersistCSVWeightedPosition(
            outputFolder, fileLimit, indexer)
    elif weightCalc:
        persister = PersistIndex.PersistCSVWeighted(outputFolder, fileLimit,
                                                    indexer)
    elif positionCalc:
        persister = PersistIndex.PersistCSVPosition(outputFolder, fileLimit,
                                                    indexer)
    else:
        persister = PersistIndex.PersistCSV(outputFolder, fileLimit, indexer)

    auxFile = "intermediate_index_{0}.txt"
    blockCounter = 1

    # getContent() in this context (with LimitedRamFileParser) returns one document at a time
    runSPIMI = True
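    # SPIMI-style loop: keep adding documents to the in-memory index while RAM
    # allows, then flush that block to an intermediate file and start a new one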
    while runSPIMI:
        while isMemoryAvailable(maximumRAM):
            doc = parser.getContent()
            if not doc:
                runSPIMI = False
                break
            indexer.createIndex(doc)

        persister.setTotalNumDocs(parser.numDocs)
        persister.persistTranslations(
            sorted(indexer.translation, key=lambda tup: tup[0]))
        if indexer.bestTerms:
            for key, terms in indexer.bestTerms.items():
                persister.persistCache(key, terms)
        if not runSPIMI and blockCounter == 1:
            # the whole collection fitted in memory: write the final index directly
            persister.persist(indexer.index)
            return 0
        else:
            # otherwise flush the current in-memory block to an intermediate file
            if persister.persist(indexer.index, auxFile.format(blockCounter)):
                blockCounter += 1
        indexer.clearVar()
        persister.clearVar()
        tokenizer.clearTokens()
        gc.collect()

    intermediateIndexes = [auxFile.format(x) for x in range(1, blockCounter)]
    if weightCalc and positionCalc:
        merger = Merger.PositionWeightMerger(intermediateIndexes, parser.numDocs,
                                             outputFolder, fileLimit)
    elif weightCalc:
        merger = Merger.WeightMerger(intermediateIndexes, parser.numDocs,
                                     outputFolder, fileLimit)
    elif positionCalc:
        merger = Merger.PositionMerger(intermediateIndexes, parser.numDocs,
                                       outputFolder, fileLimit)
    else:
        merger = Merger.SimpleMerger(intermediateIndexes, parser.numDocs,
                                     outputFolder, fileLimit)

    # merging intermediateIndexes
    tokenizer.clearVar()
    parser.clearVar()
    indexer.clearVar()
    persister.clearVar()
    del parser
    del tokenizer
    del persister

    runSPIMI = True
    allDone = False
    print("Merging...")
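    # merge the intermediate index blocks while memory allows, writing the merged
    # portion to the final index whenever RAM runs low or the merge completes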
    while runSPIMI:
        while not allDone and isMemoryAvailable(maximumRAM):
            allDone = merger.mergeIndex()
            if allDone:
                runSPIMI = False
                merger.writeIndex()
                break
        merger.writeIndex()
        gc.collect()

    del merger
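The memory check above relies on an isMemoryAvailable(maximumRAM) helper that is not shown in this example. A minimal sketch of how such a check could look, assuming maximumRAM is given in GB and using psutil (the implementation and the safety margin are assumptions, not part of the original project):

import psutil

def isMemoryAvailable(maximumRAM):
    # hypothetical sketch: stay below the configured budget, leaving a safety
    # margin so the current in-memory block can still be flushed to disk
    usedGB = psutil.Process().memory_info().rss / (1024 ** 3)
    return usedGB < 0.85 * maximumRAM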
Example No. 2
def assignment1(tokenizer, outputFolder, inputFolder, limit, weightCalc,
                positionCalc, fileLimit):
    """
    Follows the execution flow specific for the first assignment.

    :param tokenizer: class instance to be used in the tokenization process
    :type tokenizer: Tokenizer
    :param outputFolder: name of the folder where the final index will be written to
    :type outputFolder: str
    :param inputFolder: single-element list with the name of the folder containing the files to be indexed
    :type inputFolder: list<str>
    :param limit: limit number of documents to have in consideration, None if no limit
    :type limit: int
    :param weightCalc: True if the term weights are to be calculated, False if not
    :type weightCalc: bool
    :param positionCalc: True if the term positions are to be calculated, False if not
    :type positionCalc: bool
    :param fileLimit: limit used by the index persister when writing the index files
    :type fileLimit: int

    """

    parser = FileParser.LimitedRamFileParser(inputFolder, limit)
    indexer = Indexer.FileIndexer(tokenizer, positionCalc, weightCalc, parser)

    # index every document returned by the parser until the input is exhausted
    while True:
        doc = parser.getContent()
        if not doc:
            break
        indexer.createIndex(doc)

    if weightCalc and positionCalc:
        persister = PersistIndex.PersistCSVWeightedPosition(
            outputFolder, fileLimit, indexer, parser.numDocs)
    elif weightCalc:
        persister = PersistIndex.PersistCSVWeighted(outputFolder, fileLimit,
                                                    indexer, parser.numDocs)
    elif positionCalc:
        persister = PersistIndex.PersistCSVPosition(outputFolder, fileLimit,
                                                    indexer, parser.numDocs)
    else:
        persister = PersistIndex.PersistCSV(outputFolder, fileLimit, indexer,
                                            parser.numDocs)
    persister.persist()

    if indexer.bestTerms:
        for key, terms in indexer.bestTerms.items():
            persister.persistCache(key, terms)
    persister.persistTranslations(
        sorted(indexer.translation, key=lambda tup: tup[0]))

    tokenizer.clearVar()
    parser.clearVar()
    indexer.clearVar()
    persister.clearVar()
    del parser
    del indexer
    del tokenizer
    del persister
    gc.collect()
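A hypothetical way to drive these functions (the tokenizer class, paths and limits below are placeholders, not taken from the project); assignment2 is called the same way, with the additional maximumRAM argument:

tokenizer = Tokenizer.SimpleTokenizer()  # placeholder tokenizer class
assignment1(tokenizer, "outputIndex/", ["corpus/"], None,
            weightCalc=True, positionCalc=True, fileLimit=50000)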