def assignment2(tokenizer, outputFolder, inputFolder, limit, weightCalc,
                positionCalc, maximumRAM, fileLimit):
    """
    Follows the execution flow specific for the second assignment (SPIMI:
    index documents until RAM runs low, persist intermediate blocks, then
    merge the blocks into the final index).

    :param tokenizer: class instance to be used in the tokenization process
    :type tokenizer: Tokenizer
    :param outputFolder: name of the folder where the final index will be written to
    :type outputFolder: str
    :param inputFolder: list of one element representing the name of the folder
        that contains the files with the textual information to be indexed
    :type inputFolder: list<str>
    :param limit: limit number of documents to have in consideration, None if no limit
    :type limit: int
    :param weightCalc: True if the term weights are to be calculated, False if not
    :type weightCalc: bool
    :param positionCalc: True if the term positions are to be calculated, False if not
    :type positionCalc: bool
    :param maximumRAM: maximum amount of RAM (in Gb) allowed for the program execution
    :type maximumRAM: int
    :param fileLimit: limit used by the persister/merger when splitting the
        output index across files — TODO confirm exact semantics against PersistIndex
    :type fileLimit: int
    """
    parser = FileParser.LimitedRamFileParser(inputFolder, limit)
    indexer = Indexer.FileIndexer(tokenizer, positionCalc, weightCalc)

    # Pick the persister variant matching the requested index format.
    if weightCalc and positionCalc:
        persister = PersistIndex.PersistCSVWeightedPosition(
            outputFolder, fileLimit, indexer)
    elif weightCalc:
        persister = PersistIndex.PersistCSVWeighted(outputFolder, fileLimit,
                                                    indexer)
    elif positionCalc:
        persister = PersistIndex.PersistCSVPosition(outputFolder, fileLimit,
                                                    indexer)
    else:
        persister = PersistIndex.PersistCSV(outputFolder, fileLimit, indexer)

    auxFile = "intermediate_index_{0}.txt"
    blockCounter = 1

    # Indexing phase: getContent() in this context (with LimitedRamFileParser)
    # returns one document per call. Fill the in-memory index until memory
    # runs low, flush it to an intermediate file, repeat until input is done.
    runSPIMI = True
    while runSPIMI:
        while isMemoryAvailable(maximumRAM):
            doc = parser.getContent()
            if not doc:
                runSPIMI = False
                break
            indexer.createIndex(doc)

        persister.setTotalNumDocs(parser.numDocs)
        persister.persistTranslations(
            sorted(indexer.translation, key=lambda tup: tup[0]))
        if indexer.bestTerms:
            for key in indexer.bestTerms:
                persister.persistCache(key, indexer.bestTerms[key])

        if not runSPIMI and blockCounter == 1:
            # Everything fit in a single block: write the final index
            # directly, no merge phase needed.
            persister.persist(indexer.index)
            return 0
        else:
            if persister.persist(indexer.index, auxFile.format(blockCounter)):
                blockCounter += 1
            # Drop the in-memory index before the next SPIMI round.
            indexer.clearVar()
            persister.clearVar()
            tokenizer.clearTokens()
            gc.collect()

    # Merge phase: pick the merger variant matching the intermediate format.
    # Only one branch runs, so the file list is computed once up front.
    intermediateFiles = [auxFile.format(x) for x in range(1, blockCounter)]
    if weightCalc and positionCalc:
        merger = Merger.PositionWeightMerger(
            intermediateFiles, parser.numDocs, outputFolder, fileLimit)
    elif weightCalc:
        merger = Merger.WeightMerger(
            intermediateFiles, parser.numDocs, outputFolder, fileLimit)
    elif positionCalc:
        merger = Merger.PositionMerger(
            intermediateFiles, parser.numDocs, outputFolder, fileLimit)
    else:
        merger = Merger.SimpleMerger(
            intermediateFiles, parser.numDocs, outputFolder, fileLimit)

    # Release everything the merge phase no longer needs.
    tokenizer.clearVar()
    parser.clearVar()
    indexer.clearVar()
    persister.clearVar()
    del parser
    del tokenizer
    del persister

    # Merge intermediate indexes, flushing partial output whenever memory
    # runs low, until the merger reports completion.
    runSPIMI = True
    allDone = False
    print("Merging...")
    while runSPIMI:
        while not allDone and isMemoryAvailable(maximumRAM):
            allDone = merger.mergeIndex()
            if allDone:
                runSPIMI = False
                merger.writeIndex()
                break
        merger.writeIndex()
        gc.collect()
    del merger
def assignment1(tokenizer, outputFolder, inputFolder, limit, weightCalc,
                positionCalc, fileLimit):
    """
    Follows the execution flow specific for the first assignment (single
    in-memory pass: index every document, then persist the whole index).

    :param tokenizer: class instance to be used in the tokenization process
    :type tokenizer: Tokenizer
    :param outputFolder: name of the folder where the final index will be written to
    :type outputFolder: str
    :param inputFolder: list of one element representing the name of the folder
        that contains the files with the textual information to be indexed
    :type inputFolder: list<str>
    :param limit: limit number of documents to have in consideration, None if no limit
    :type limit: int
    :param weightCalc: True if the term weights are to be calculated, False if not
    :type weightCalc: bool
    :param positionCalc: True if the term positions are to be calculated, False if not
    :type positionCalc: bool
    :param fileLimit: limit used by the persister when splitting the output
        index across files — TODO confirm exact semantics against PersistIndex
    :type fileLimit: int
    """
    parser = FileParser.LimitedRamFileParser(inputFolder, limit)
    indexer = Indexer.FileIndexer(tokenizer, positionCalc, weightCalc, parser)

    # Index every document in one in-memory pass (no RAM limit here).
    while True:
        doc = parser.getContent()
        if not doc:
            break
        indexer.createIndex(doc)

    # Pick the persister variant matching the requested index format; the
    # persist() call itself is identical for all variants.
    if weightCalc and positionCalc:
        persister = PersistIndex.PersistCSVWeightedPosition(
            outputFolder, fileLimit, indexer, parser.numDocs)
    elif weightCalc:
        persister = PersistIndex.PersistCSVWeighted(outputFolder, fileLimit,
                                                    indexer, parser.numDocs)
    elif positionCalc:
        persister = PersistIndex.PersistCSVPosition(outputFolder, fileLimit,
                                                    indexer, parser.numDocs)
    else:
        persister = PersistIndex.PersistCSV(outputFolder, fileLimit, indexer,
                                            parser.numDocs)
    persister.persist()

    if indexer.bestTerms:
        for key in indexer.bestTerms:
            persister.persistCache(key, indexer.bestTerms[key])
    persister.persistTranslations(
        sorted(indexer.translation, key=lambda tup: tup[0]))

    # Release everything before returning.
    tokenizer.clearVar()
    parser.clearVar()
    indexer.clearVar()
    persister.clearVar()
    del parser
    del indexer
    del tokenizer
    del persister
    gc.collect()