import os
import warnings

import numpy as np

# Import paths below are assumptions about the surrounding package layout.
from bnpy.data import BagOfWordsData, GroupXData, XData

try:
    # Optional compiled extension for fast ldac parsing.
    from TextFileReaderX import read_from_ldac_file
    hasCythonExt = True
except ImportError:
    hasCythonExt = False


def LoadBagOfWordsDataFromFile_ldac_cython(filepath, nTokensPerByte=0.2,
                                           **kwargs):
    ''' Load bag-of-words dataset from .ldac file via Cython extension.

    Falls back to the pure-python reader if the extension is unavailable.

    Returns
    -------
    Data : BagOfWordsData
    '''
    if not hasCythonExt:
        warnings.warn("Cython extension TextFileReaderX not found." +
                      " Falling back to pure python.")
        return BagOfWordsData.LoadFromFile_ldac_python(filepath, **kwargs)
    # Estimate how many token entries the file holds from its size on disk,
    # so output buffers can be preallocated before parsing.
    filesize_bytes = os.path.getsize(filepath)
    nUniqueTokens = int(nTokensPerByte * filesize_bytes)
    try:
        dptr = np.zeros(nUniqueTokens, dtype=np.int32)
        wids = np.zeros(nUniqueTokens, dtype=np.int32)
        wcts = np.zeros(nUniqueTokens, dtype=np.float64)
        stop, dstop = read_from_ldac_file(
            filepath, nUniqueTokens, dptr, wids, wcts)
        return BagOfWordsData(
            word_id=wids[:stop], word_count=wcts[:stop],
            doc_range=dptr[:dstop], **kwargs)
    except IndexError:
        # Preallocated buffers were too small. Retry with double the
        # per-byte estimate.
        return LoadBagOfWordsDataFromFile_ldac_cython(
            filepath, nTokensPerByte * 2, **kwargs)
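# For reference, a minimal pure-python sketch of the parse that
# read_from_ldac_file performs. This is an illustration only, assuming the
# standard ldac line format "N id:ct id:ct ..." (one document per line);
# the real fallback in this codebase is BagOfWordsData.LoadFromFile_ldac_python.
def _sketch_read_ldac(filepath):
    word_id = list()
    word_count = list()
    doc_range = [0]
    with open(filepath, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 1:
                continue
            # fields[0] holds the number of unique tokens in this doc;
            # the remaining fields are "wordID:count" pairs.
            for pair in fields[1:]:
                wid, ct = pair.split(':')
                word_id.append(int(wid))
                word_count.append(float(ct))
            # doc_range[d]:doc_range[d+1] indexes doc d's entries.
            doc_range.append(len(word_id))
    return (np.asarray(word_id, dtype=np.int32),
            np.asarray(word_count, dtype=np.float64),
            np.asarray(doc_range, dtype=np.int32))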
def loadDataForBatch(self, batchID):
    ''' Load the data assigned to a particular batch.

    Returns
    -------
    Dchunk : bnpy.data.DataObj subclass
    '''
    dpath = self.datafileList[batchID]
    if dpath.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(dpath, **self.DataInfo)
    elif self.dataset_type == 'GroupXData':
        return GroupXData.LoadFromFile(dpath, **self.DataInfo)
    else:
        return XData.LoadFromFile(dpath, **self.DataInfo)
def loadDataForSlice(filepath='', dataset_type='', **kwargs):
    """ Return data object loaded from specific file.

    Keyword args
    ------------
    workerID
    nWorkers
    """
    if filepath.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(filepath, **kwargs)
    elif dataset_type == 'GroupXData':
        return GroupXData.LoadFromFile(filepath, **kwargs)
    else:
        return XData.LoadFromFile(filepath, **kwargs)
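# Usage sketch for loadDataForSlice; the path and kwargs below are
# hypothetical. Each parallel worker passes workerID/nWorkers through
# kwargs so the underlying loader can keep only its share of the documents.
#
#     Dslice = loadDataForSlice(
#         filepath='/path/to/batch1.ldac',
#         dataset_type='BagOfWordsData',
#         workerID=2, nWorkers=8)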
if __name__ == '__main__':
    # Smoke test: the fast Cython loader must agree exactly with the
    # pure-python reader on the same file.
    # fpath = '/ltmp/testNYT.ldac'
    fpath = '/data/liv/textdatasets/nytimes/batches/batch111.ldac'
    vocab_size = 8000
    fastD = LoadBagOfWordsDataFromFile_ldac_cython(
        fpath, vocab_size=vocab_size)
    slowD = BagOfWordsData.LoadFromFile_ldac_python(
        fpath, vocab_size=vocab_size)
    print(fastD.word_id[:10])
    print(slowD.word_id[:10])
    assert np.allclose(fastD.word_id, slowD.word_id)
    assert np.allclose(fastD.word_count, slowD.word_count)
    assert np.allclose(fastD.doc_range, slowD.doc_range)