Example #1
    def index(self):
        for doi, title, abstract in self.col:
            if self.tokenizerType == '0':  # simple
                tokenizer = Tokenizer.SimpleTokenizer(title, abstract)
            else:  # better
                tokenizer = Tokenizer.BetterTokenizer(title, abstract)

            terms = tokenizer.getTerms()
            for term in terms:
                if term in self.term_map:
                    if doi in self.term_map[term]:
                        self.term_map[term][doi] += 1
                    else:
                        self.term_map[term][doi] = 1
                else:
                    self.term_map[term] = {doi: 1}  # key: docId, value: term_freq
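
All three examples assume a Tokenizer module providing SimpleTokenizer and BetterTokenizer, which is not shown. Below is a minimal sketch of the interface the index() methods rely on; the tokenization rule (lowercase, split on non-alphanumeric runs, drop tokens shorter than 3 characters) and the method bodies are assumptions, not the repository's actual behavior.

# Hypothetical sketch of the Tokenizer interface these examples assume.
import re


class SimpleTokenizer:
    def __init__(self, title='', abstract=''):
        self.text = title + ' ' + abstract

    def changeText(self, text):
        # replace the text to tokenize (used by the streaming variants below)
        self.text = text

    def getTerms(self, withPositions=False):
        terms = [t for t in re.split(r'[^a-z0-9]+', self.text.lower())
                 if len(t) > 2]
        if not withPositions:
            return terms  # one entry per occurrence, so callers can count tf
        # with positions: one entry per distinct term plus the list of
        # positions where it occurs, mirroring the (terms, termPositions)
        # pair unpacked in Example #3
        positions = {}
        for pos, term in enumerate(terms):
            positions.setdefault(term, []).append(pos)
        return list(positions.keys()), list(positions.values())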
Example #2
    def index(self):

        self.N = 0

        if self.tokenizerType == '0':  # simple
            tokenizer = Tokenizer.SimpleTokenizer('')
        else:  # better
            tokenizer = Tokenizer.BetterTokenizer('')

        for doi, title, abstract in CorpusReader.CorpusReader(
                self.collectionPath).readCorpus():
            self.N += 1
            tokenizer.changeText(title + " " + abstract)
            terms = tokenizer.getTerms()

            # first, we populate the dictionary postingsMaps with the term frequency {term: {docId: term_freq} }
            for term in terms:
                if term in self.postingsMaps:
                    if doi in self.postingsMaps[term]:
                        self.postingsMaps[term][doi] += 1
                    else:
                        self.postingsMaps[term][doi] = 1
                else:
                    self.postingsMaps[term] = {
                        doi: 1
                    }  # key: docId, value: term_freq

        # lnc (logarithmic term frequency, no document frequency, cosine normalization)
        # then, we transform postingsMaps from {term: {docId: term_freq}}
        # to {term: (idf, {docId: weight})}, using logarithmic term frequency
        self.postingsMaps = {
            term: (getIDFt(term, self.postingsMaps, self.N), {
                docId: getLogWeight(term, docId, self.postingsMaps)
                for docId in self.postingsMaps[term]
            })
            for term in self.postingsMaps
        }
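
Example #2 (and the merge phase in Example #3) calls getIDFt and getLogWeight, which are not shown. A minimal sketch under the standard definitions the lnc comment points to follows; the repository's actual helpers may differ.

import math


def getIDFt(term, postingsMaps, N):
    # inverse document frequency: idf_t = log10(N / df_t), where df_t is the
    # number of documents whose postings map contains the term
    return math.log10(N / len(postingsMaps[term]))


def getLogWeight(term, docId, postingsMaps):
    # logarithmic term frequency, the 'l' in lnc: w = 1 + log10(tf)
    return 1 + math.log10(postingsMaps[term][docId])

Note that the 'c' in lnc (cosine normalization) is not applied by these helpers; it would divide each document's weights by the norm of its weight vector in a later pass.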
Example #3
    def index(self):

        start_indexing = timeit.default_timer()
        self.N = 0

        if self.tokenizerType == '0':  # simple
            tokenizer = Tokenizer.SimpleTokenizer('')
        else:  # better
            tokenizer = Tokenizer.BetterTokenizer('')

        corpusReader = CorpusReader.CorpusReader(self.collectionPath)

        print('start memory available: {}%'.format(
            psutil.virtual_memory().available * 100 /
            psutil.virtual_memory().total))

        corpusReader.startReadingCorpus()
        nDicts = 0

        # ------------------------------------------ INDEX WITH TERM POSITIONS -----------------------------------------
        if self.withPositions:

            # ---------------------------------------------- INDEX BLOCKS ----------------------------------------------
            while True:

                # -------------------------------------------- Get Document --------------------------------------------
                doc = corpusReader.readDoc()

                # last document
                if doc == -1:
                    if self.postingsMaps != {}:
                        # flush the remaining in-memory block to disk
                        self.writeIndexToBlockFileWithPositions(
                            './dicts/dict' + str(nDicts))
                        print('available memory: {} %'.format(
                            psutil.virtual_memory().available * 100 /
                            psutil.virtual_memory().total))

                        nDicts += 1
                        self.postingsMaps = {}  # clean dictionary
                    break
                elif doc is None:
                    continue

                (doi, title, abstract) = doc
                del doc
                self.N += 1

                # ------------------------------------------- Get Document Terms ---------------------------------------
                tokenizer.changeText(title + " " + abstract)
                del title
                del abstract
                terms, termPositions = tokenizer.getTerms(withPositions=True)
                tokenizer.changeText("")  # clean term memory from tokenizer

                # first, we populate the dictionary postingsMaps with the term positions {term: {docId: [termpositions]} }

                # less than 10% of memory available: flush the current
                # in-memory block to disk and start a new one
                if (psutil.virtual_memory().available * 100 /
                        psutil.virtual_memory().total
                    ) <= 10 and self.postingsMaps != {}:
                    self.writeIndexToBlockFileWithPositions('./dicts/dict' +
                                                            str(nDicts))
                    print('available memory: {} %'.format(
                        psutil.virtual_memory().available * 100 /
                        psutil.virtual_memory().total))

                    nDicts += 1
                    self.postingsMaps = {}  # clean dictionary

                # each term appears only once in terms (getTerms groups the
                # positions per term), so doi is always a new key here
                for termInd in range(len(terms)):
                    if terms[termInd] in self.postingsMaps:
                        self.postingsMaps[terms[termInd]].update(
                            {doi: termPositions[termInd]})
                    else:
                        # key: docId, value: [pos1, pos2, pos3, ...]
                        self.postingsMaps[terms[termInd]] = {
                            doi: termPositions[termInd]
                        }

                del terms
                del termPositions


            # ---------------------------------------- ENDED INDEXING BLOCKS -------------------------------------------
            stop_indexing = timeit.default_timer()
            print('indexing into blocks: {} minutes and {} seconds'.format(
                (stop_indexing - start_indexing) // 60,
                (stop_indexing - start_indexing) % 60))

            start = timeit.default_timer()
            if os.path.isfile("index"):
                os.remove("index")

            final_dict = open("index", "w")
            dict_names = [
                './dicts/dict' + str(nDict) for nDict in range(nDicts)
            ]

            # -------------------------------------------- MERGE INDEX BLOCKS ------------------------------------------
            print('merging dictionary phase and writing index to disk')

            temp_dicts = [open(dict_name, "r") for dict_name in dict_names]
            ntermsToDisk = 0
            while temp_dicts != []:
                # iterate over a copy: removing from a list while iterating
                # over it skips elements
                for dict_file in list(temp_dicts):
                    # ---------------------- Read first line of each file ------------------------------------------
                    line = dict_file.readline()

                    if not line:
                        # block file exhausted: close it and delete it from disk
                        dict_file.close()
                        fileInd = temp_dicts.index(dict_file)
                        os.remove(dict_names[fileInd])
                        del dict_names[fileInd]
                        del temp_dicts[fileInd]
                        continue

                    # ------------------------ Save line info to memory --------------------------------------------
                    # line format: 'term|docid|pos1,pos2,pos3|docid|pos1,pos2,pos3|...|\n'
                    info = [
                        field for field in line.rstrip('\n').split('|')
                        if field
                    ]
                    term = info[0]  # term
                    docIds = info[1:][0::2]  # [docid, docid, ...]
                    termPositions = [
                        positions.split(',') for positions in info[1:][1::2]
                    ]  # [[pos1,pos2,pos3], [pos1,pos2,pos3], ...]
                    if term in self.postingsMaps:
                        # merge the postings lists; a docId never repeats
                        # across blocks, since a block is only written after
                        # a document has been fully read
                        self.postingsMaps[term].update({
                            docIds[docInd]: termPositions[docInd]
                            for docInd in range(len(docIds))
                        })
                    else:
                        self.postingsMaps[term] = {
                            docIds[docInd]: termPositions[docInd]
                            for docInd in range(len(docIds))
                        }

                if self.postingsMaps != {}:
                    # ------------------------- CALCULATE WEIGHTS AND WRITE ON FINAL INDEX -----------------------------
                    # todo: verify that these functions (storecalculations) work with the new self.postingsMaps structure
                    # get the first element of the alphabetically sorted list of terms in memory
                    minorTerm = sorted(self.postingsMaps.keys())[0]

                    # write its information to the final dictionary
                    final_dict.writelines([
                        minorTerm + ':' +  # term:
                        str(getIDFt(minorTerm, self.postingsMaps, self.N)) +
                        '|' +  # idf|
                        '|'.join([
                            str(doc_id) + ':' +  # doc_id:
                            str(
                                getLogWeightPositions(minorTerm, doc_id,
                                                      self.postingsMaps)) +
                            ':' +  # term_weight:
                            ','.join([str(pos)
                                      for pos in positions])  # pos1,pos2,...
                            for doc_id, positions in
                            self.postingsMaps[minorTerm].items()
                        ]) + '\n'
                    ])

                    ntermsToDisk += 1

                    # remove it from memory
                    del self.postingsMaps[minorTerm]

            # ---------------------------------------- ENDED MERGING INDEX BLOCKS --------------------------------------
            del info
            del term
            del docIds
            del termPositions
            del minorTerm

            final_dict.close()

            stop = timeit.default_timer()
            print(
                'merge and write of final dictionary: {} minutes and {} seconds'
                .format((stop - start) // 60, (stop - start) % 60))

        # ----------------------------------------- INDEX WITHOUT TERM POSITIONS ---------------------------------------
        else:
            # ---------------------------------------------- INDEX BLOCKS ----------------------------------------------
            while True:
                doc = corpusReader.readDoc()
                # last document
                if doc == -1:
                    if self.postingsMaps != {}:
                        # flush the remaining in-memory block to disk
                        self.writeIndexToBlockFile('./dicts/dict' +
                                                   str(nDicts))
                        print('available memory: {} %'.format(
                            psutil.virtual_memory().available * 100 /
                            psutil.virtual_memory().total))

                        nDicts += 1
                        self.postingsMaps = {}  # clean dictionary
                    break
                elif doc is None:
                    continue

                (doi, title, abstract) = doc
                del doc
                self.N += 1

                # ------------------------------------------- Get Document Terms ---------------------------------------
                tokenizer.changeText(title + " " + abstract)
                del title
                del abstract
                terms = tokenizer.getTerms(withPositions=False)
                tokenizer.changeText("")  # clean term memory from tokenizer

                # first, we populate the dictionary postingsMaps with the term frequency {term: {docId: term_freq} }
                for term in terms:
                    # less than 10% of memory available: flush the current
                    # in-memory block to disk and start a new one
                    if (
                            psutil.virtual_memory().available * 100 /
                            psutil.virtual_memory().total
                    ) <= 10 and self.postingsMaps != {}:
                        self.writeIndexToBlockFile('./dicts/dict' +
                                                   str(nDicts))
                        nDicts += 1
                        self.postingsMaps = {}  # clean dictionary

                    if term in self.postingsMaps:
                        if doi in self.postingsMaps[term]:
                            self.postingsMaps[term][doi] += 1
                        else:
                            self.postingsMaps[term][doi] = 1
                    else:
                        self.postingsMaps[term] = {
                            doi: 1
                        }  # key: docId, value: term_freq

            # ---------------------------------------- ENDED INDEXING BLOCKS -------------------------------------------
            start = timeit.default_timer()
            if os.path.isfile("index"):
                os.remove("index")
            final_dict = open("index", "w")
            dict_names = [
                './dicts/dict' + str(nDict) for nDict in range(nDicts)
            ]

            # -------------------------------------------- MERGE INDEX BLOCKS ------------------------------------------
            temp_dicts = [open(dict_name, "r") for dict_name in dict_names]
            while temp_dicts != []:
                # iterate over a copy: removing from a list while iterating
                # over it skips elements
                for dict_file in list(temp_dicts):
                    # ---------------------- Read first line of each file ------------------------------------------
                    line = dict_file.readline()

                    if not line:
                        # block file exhausted: close it and delete it from disk
                        dict_file.close()
                        fileInd = temp_dicts.index(dict_file)
                        os.remove(dict_names[fileInd])
                        del dict_names[fileInd]
                        del temp_dicts[fileInd]
                        continue

                    # ------------------------ Save line info to memory --------------------------------------------
                    # line format: 'term|docid|term_freq|docid|term_freq|...|\n'
                    info = [
                        field for field in line.rstrip('\n').split('|')
                        if field
                    ]
                    term = info[0]  # term
                    docIds = info[1:][0::2]  # [docid, docid, ...]
                    # term frequencies come back as strings; convert them so
                    # getLogWeight can do arithmetic on them
                    termFreqs = [int(freq) for freq in info[1:][1::2]]

                    if term in self.postingsMaps:
                        self.postingsMaps[term].update({
                            docIds[docInd]: termFreqs[docInd]
                            for docInd in range(len(docIds))
                        })
                    else:
                        self.postingsMaps[term] = {
                            docIds[docInd]: termFreqs[docInd]
                            for docInd in range(len(docIds))
                        }

                if self.postingsMaps != {}:

                    # ------------------------- CALCULATE WEIGHTS AND WRITE ON FINAL INDEX -----------------------------
                    # todo: verify that these functions (storecalculations) work with the new self.postingsMaps structure
                    # get the first element of the alphabetically sorted list of terms in memory
                    minorTerm = sorted(self.postingsMaps.keys())[0]

                    # write its information to the final dictionary
                    final_dict.writelines([
                        minorTerm + ':' +  # term:
                        str(getIDFt(minorTerm, self.postingsMaps, self.N)) +
                        '|' +  # idf|
                        '|'.join([
                            str(doc_id) + ':' +  # doc_id:
                            str(getLogWeight(minorTerm, doc_id,
                                             self.postingsMaps))  # term_weight
                            for doc_id in self.postingsMaps[minorTerm]
                        ]) + '\n'
                    ])

                    # remove it from memory
                    del self.postingsMaps[minorTerm]

            # ---------------------------------------- ENDED MERGING INDEX BLOCKS --------------------------------------
            del info
            del term
            del docIds
            del termFreqs
            del minorTerm

            final_dict.close()

            stop = timeit.default_timer()
            print(
                'merge and write of final dictionary: {} minutes and {} seconds'
                .format((stop - start) // 60, (stop - start) % 60))
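
Example #3 never shows writeIndexToBlockFile, writeIndexToBlockFileWithPositions, or getLogWeightPositions. From the line formats the merge phase parses back, a plausible standalone sketch follows; the function names and parameters here are hypothetical stand-ins for the real methods, and the sorted term order is an assumption (sorted blocks are what the minorTerm-based, SPIMI-style merge requires).

import math
import os


def writeBlock(postingsMaps, filePath):
    # standalone sketch of writeIndexToBlockFile: one sorted term per line,
    # 'term|docid|term_freq|docid|term_freq|\n', the format the merge
    # phase parses back
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'w') as block:
        for term in sorted(postingsMaps):
            postings = '|'.join(str(docId) + '|' + str(freq)
                                for docId, freq in postingsMaps[term].items())
            block.write(term + '|' + postings + '|\n')


def writeBlockWithPositions(postingsMaps, filePath):
    # sketch of writeIndexToBlockFileWithPositions:
    # 'term|docid|pos1,pos2,pos3|docid|pos1,pos2,pos3|\n'
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'w') as block:
        for term in sorted(postingsMaps):
            postings = '|'.join(
                str(docId) + '|' + ','.join(str(p) for p in positions)
                for docId, positions in postingsMaps[term].items())
            block.write(term + '|' + postings + '|\n')


def getLogWeightPositions(term, docId, postingsMaps):
    # with positions, tf is the number of stored positions: w = 1 + log10(tf)
    return 1 + math.log10(len(postingsMaps[term][docId]))

Writing each block in sorted term order is what lets the merge phase pop the alphabetically smallest in-memory term each round and still produce a sorted final index.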