Пример #1
0
 def __init__(self, blockfiles, file_name="mf.txt", out_dir="./merged"):
     """
     Store the block files to process and create the output BlockFile.

     :param blockfiles: block files to work on; kept as ``self.block_files``
     :param file_name: name of the output file created inside ``out_dir``
     :param out_dir: directory that will hold the output file
     """
     self.block_files = blockfiles
     self.out_dir = out_dir
     self.file_name = file_name
     # The helpers run before the output path is built, in this order.
     # NOTE(review): get_out_dir() may adjust self.out_dir - its body is not
     # visible here, so the path is computed only afterwards.
     self.get_out_dir()
     self.prep_output()
     output_path = os.path.join(self.out_dir, self.file_name)
     self.out_file = core.BlockFile(output_path)
Пример #2
0
    def index(self):
        """
        Index the corpus with the SPIMI algorithm, one block file at a time.

        Tokens are pulled from ``self.tokens``; ``token[0]`` is used as the
        term and ``token[1]`` is appended to that term's posting list. When
        the in-memory dictionary exceeds ``self.block_size`` megabytes, or the
        token stream is exhausted, the block is written to a file in
        ``self.out_dir`` with one line per term in sorted order:
        ``"<term> <doc> <doc> ..."``.

        Each written block bumps ``self.block_index`` and has its path
        appended to ``self.blocklist``. The final block is written even if it
        is empty.

        :return: None
        """
        # Compare in bytes: dividing by 1024 twice floors to whole megabytes
        # under Python 2 integer division, letting blocks overshoot the limit.
        # NOTE: sys.getsizeof is shallow - it measures the dict's own table,
        # not the posting lists it references.
        max_bytes = self.block_size * 1024 * 1024
        done = False
        while not done:
            block_dict = {}
            try:
                while sys.getsizeof(block_dict) <= max_bytes:
                    # Builtin next() works on both Python 2 and 3 iterators.
                    token = next(self.tokens)
                    block_dict.setdefault(token[0], []).append(token[1])
            except StopIteration:
                print("Parsed all tokens in all documents")
                done = True

            block_name = self.block_prefix + str(self.block_index) + ".txt"
            block_path = os.path.join(self.out_dir, block_name)
            out_file = core.BlockFile(block_path)
            out_file.open_file(mode="w")
            try:
                # Look postings up by the original key and stringify only for
                # output, so non-str terms do not raise KeyError.
                for term in sorted(block_dict):
                    docids = " ".join(str(doc) for doc in block_dict[term])
                    out_file.write_line(str(term) + " " + docids + "\n")
            finally:
                # Release the handle even if a write fails part-way.
                out_file.close_file()
            self.block_index += 1
            self.blocklist.append(block_path)
Пример #3
0
 def prep_files(self):
     """
     Wrap every entry of ``self.block_files`` in a ``core.BlockFile``.

     No ``open_file()`` call is made here, so opening is left to the caller
     (unless the BlockFile constructor opens the file - not visible here).

     :return: list of ``core.BlockFile`` objects, in ``self.block_files`` order
     """
     return [core.BlockFile(block_path) for block_path in self.block_files]
Пример #4
0
 def get_index(self):
     """
     Load the merged SPIMI index file into ``self.index`` for in-memory querying.

     Reads the file at ``self.merge`` record by record and stores each
     record's ``postings`` under its ``term``. The file is closed when
     reading finishes or fails.

     :return: None
     """
     in_file = core.BlockFile(self.merge)
     in_file.open_file()
     try:
         in_line = in_file.read_line()
         # The loop relies on read_line() returning a falsy value at the end.
         while in_line:
             self.index[in_line.term] = in_line.postings
             in_line = in_file.read_line()
     finally:
         # The original never closed the file; always release the handle.
         in_file.close_file()