def createStatistics(self, statisticObject, readerSourceData, normalization, sourceCustomCallback=None):
    """
    Build statistics for every file reported by the reader.

    Each path yielded by ``readerSourceData.getSourceCustom`` is probed with
    ``magic.from_file``. Files whose MIME type equals ``MIME_TEXT`` or
    ``MIME_WORD`` are handed, together with the matching entry of
    ``self._source``, to ``statisticObject.makeDocStatisticCustom``; files of
    any other MIME type are skipped.

    :param statisticObject: object used to build the statistics
    :param readerSourceData: object used to obtain the full paths of the files
    :param normalization: object used to normalize the data
    :param sourceCustomCallback: callback for the settings-retrieval object (path verification)
    :raises ParamError: if ``statisticObject`` or ``readerSourceData`` is falsy
    :raises TypeError: if an argument is not an instance of the expected class
    """
    # Required collaborators: presence is checked first, then the type.
    requiredArguments = (
        (statisticObject, Statistic,
         "statisticObject cannot be the None-object",
         "statisticObject can be the list Statistic"),
        (readerSourceData, ReaderSourceData,
         "readerSourceData cannot be the None-object",
         "readerSourceData can be the list ReaderSourceData"),
    )
    for argument, expectedClass, missingMessage, wrongTypeMessage in requiredArguments:
        if not argument:
            raise ParamError(missingMessage)
        if not isinstance(argument, expectedClass):
            raise TypeError(wrongTypeMessage)

    # The callback is optional; it is only type-checked when supplied.
    if sourceCustomCallback:
        if not isinstance(sourceCustomCallback, SourceCustomCallback):
            raise TypeError("sourceCustomCallback can be the list SourceCustomCallback")

    encodingDetector = DetectEncoding()
    fileSettings = FileSourceCustom()  # one settings object, re-pointed at each file
    for path in readerSourceData.getSourceCustom(sourceCustomCallback):
        # The path is decoded with its detected encoding before being given to magic.
        decodedPath = path.decode(encodingDetector.getEncode(path))
        mimeType = magic.from_file(decodedPath, mime=True)
        fileSettings.custom = path

        if mimeType == MIME_TEXT:
            sourceKey = 'text'
        elif mimeType == MIME_WORD:
            sourceKey = 'word'
        else:
            continue  # unsupported MIME type: nothing to index

        source = self._source[sourceKey]
        if source:
            statisticObject.makeDocStatisticCustom(source, fileSettings, normalization)
class TestDetectEncoding(unittest.TestCase):
    """Tests for ``DetectEncoding`` driven by the sample files under ``resources/``."""

    def setUp(self):
        # A fresh detector per test; sample files are resolved against the
        # current working directory.
        self.__detectEncoding = DetectEncoding()
        self.__dirPath = os.path.abspath(os.curdir)

    def testGetEncode(self):
        # (relative path, expected encoding name, failure message), checked in order.
        samples = (
            ("resources/test_encode_utf8", "utf-8",
             "fail detect encode test_encode_utf8"),
            ("resources/test_encode_win1251", "windows-1251",
             "fail detect encode test_encode_win1251"),
            ("resources/test_encode_win866", "IBM866",
             "fail detect encode test_encode_win866"),
        )
        for relativePath, expectedEncode, failMessage in samples:
            samplePath = os.path.join(self.__dirPath, relativePath)
            with open(samplePath) as sampleFile:
                detectedEncode = self.__detectEncoding.getEncode(sampleFile.read())
            self.assertEqual(detectedEncode, expectedEncode, failMessage)

    def testGetEncode1TypeError(self):
        # Non-text arguments must be rejected with TypeError.
        for badArgument in (123, None):
            self.assertRaises(TypeError, self.__detectEncoding.getEncode, badArgument)

    def testEncode(self):
        # Re-encode the windows-1251 sample to utf-8 and detect the result.
        samplePath = os.path.join(self.__dirPath, "resources/test_encode_win1251")
        with open(samplePath) as sampleFile:
            rawText = sampleFile.read()
            convertedText = self.__detectEncoding.encodeText(rawText, "utf-8")
        detectedEncode = self.__detectEncoding.getEncode(convertedText)
        self.assertEqual(detectedEncode, "utf-8",
                         "fail encode text from test_encode_win1251")

    def testEncodeLookupError(self):
        # An unknown target encoding name must raise LookupError.
        samplePath = os.path.join(self.__dirPath, "resources/test_encode_win1251")
        with open(samplePath) as sampleFile:
            rawText = sampleFile.read()
            self.assertRaises(LookupError, self.__detectEncoding.encodeText,
                              rawText, "utf-8_test_")
class TestDetectEncoding(unittest.TestCase):
    """Checks ``DetectEncoding`` against the encoded sample files in ``resources/``."""

    def setUp(self):
        # Each test gets its own detector; the base directory is the current
        # working directory at the time the test starts.
        self.__detectEncoding = DetectEncoding()
        self.__dirPath = os.path.abspath(os.curdir)

    def testGetEncode(self):
        # Every row: sample file, encoding name getEncode must report, failure message.
        expectations = [
            ("resources/test_encode_utf8", "utf-8",
             "fail detect encode test_encode_utf8"),
            ("resources/test_encode_win1251", "windows-1251",
             "fail detect encode test_encode_win1251"),
            ("resources/test_encode_win866", "IBM866",
             "fail detect encode test_encode_win866"),
        ]
        for resourceName, wantedEncode, message in expectations:
            with open(os.path.join(self.__dirPath, resourceName)) as resourceFile:
                content = resourceFile.read()
                foundEncode = self.__detectEncoding.getEncode(content)
            self.assertEqual(foundEncode, wantedEncode, message)

    def testGetEncode1TypeError(self):
        # getEncode must raise TypeError for an int and for None.
        getEncode = self.__detectEncoding.getEncode
        self.assertRaises(TypeError, getEncode, 123)
        getEncode = self.__detectEncoding.getEncode
        self.assertRaises(TypeError, getEncode, None)

    def testEncode(self):
        # Text converted to utf-8 must afterwards be detected as utf-8.
        resourcePath = os.path.join(self.__dirPath, "resources/test_encode_win1251")
        with open(resourcePath) as resourceFile:
            content = resourceFile.read()
            encodedText = self.__detectEncoding.encodeText(content, "utf-8")
        foundEncode = self.__detectEncoding.getEncode(encodedText)
        self.assertEqual(foundEncode, "utf-8",
                         "fail encode text from test_encode_win1251")

    def testEncodeLookupError(self):
        # A bogus encoding name must surface as LookupError.
        resourcePath = os.path.join(self.__dirPath, "resources/test_encode_win1251")
        with open(resourcePath) as resourceFile:
            content = resourceFile.read()
            self.assertRaises(LookupError, self.__detectEncoding.encodeText,
                              content, "utf-8_test_")
def setUp(self):
    """Prepare the fixture: a new ``DetectEncoding`` and the absolute path of the current directory."""
    detector = DetectEncoding()
    self.__detectEncoding = detector

    # os.curdir is resolved against the working directory at call time.
    workingDirectory = os.path.abspath(os.curdir)
    self.__dirPath = workingDirectory
def __init__(self, mongoUtils):
    """
    Store the given helper and start with an empty buffer.

    :param mongoUtils: object kept as this instance's ``mongoUtils`` helper
    """
    # Injected collaborator.
    self.__mongoUtils = mongoUtils

    # Buffer state: size limit (presumably kilobytes -- confirm against the
    # code that reads it), pending contents and the id of the current dictionary.
    self.__bufferSize = 2048
    self.__bufferDict = {}
    self.__bufferDictID = None

    # Encoding helper, constructed once per instance.
    self.__detectEncoding = DetectEncoding()
class MongoStatistic(Statistic):
    """
    Build a word index and save it in MongoDB.

    Word counts are accumulated in an in-memory buffer dictionary. The buffer
    is pushed to ``mongoUtils`` whenever ``sys.getsizeof`` reports it larger
    than ``bufferSize`` kilobytes, and once more when a document is finished.
    """

    def __init__(self, mongoUtils):
        """
        :param mongoUtils: object through which dictionaries are saved and merged
        """
        self.__bufferSize = 2048  # flush threshold; multiplied by 1024 in __makeDocIndex
        self.__mongoUtils = mongoUtils
        self.__bufferDictID = None  # value returned by saveDict for the current document
        self.__bufferDict = {}  # word -> occurrences not yet flushed
        self.__detectEncoding = DetectEncoding()

    def makeDocStatistic(self, sourceName, ss, data, normalizationCallback):
        """
        Index one piece of data as a new document.

        :param sourceName: name of the source
        :param ss: source size
        :param data: data to normalize and index
        :param normalizationCallback: ``Normalization`` instance used to normalize ``data``
        :raises ParamError: if ``normalizationCallback`` is falsy or not a ``Normalization``
        """
        if not normalizationCallback:
            raise ParamError("normalizationCallback not to be a None")
        if not isinstance(normalizationCallback, Normalization):
            raise ParamError("normalizationCallback is not instance Normalization")

        sn = self.__detectEncoding.encodeText(sourceName)
        sen = self.__detectEncoding.getEncode(sourceName)
        sde = normalizationCallback.getNormalizeTextEncode()
        sdc = datetime.datetime.now()

        normalizeData = normalizationCallback.normalizeText(data)  # normalize the received text
        self.__makeDocIndex(normalizeData, True, sn, sen, ss, sde, sdc)  # save
        self.__saveDict(True)  # flush the dictionary if it holds anything

    def makeDocStatisticCustom(self, openSourceCallback, sourceCustom, normalizationCallback):
        """
        Open a source, read it piece by piece and index it as one document.

        The document entry is created from the first piece that is indexed
        without a ``ParamError``; later pieces are added to the same entry.
        The source is always closed, even when indexing fails.

        :param openSourceCallback: ``Source`` instance used to open, read and close the source
        :param sourceCustom: ``SourceCustom`` instance whose ``getCustom()`` is passed to ``openSource``
        :param normalizationCallback: ``Normalization`` instance used to normalize each piece
        :raises ParamError: if an argument is falsy or of the wrong type
        """
        if not openSourceCallback or not normalizationCallback or not sourceCustom:
            raise ParamError("openSourceCallback or normalizationCallback or sourceCustom not to be a None")
        if not isinstance(openSourceCallback, Source):
            raise ParamError("openSourceCallback is not instance Source")
        if not isinstance(sourceCustom, SourceCustom):
            raise ParamError("sourceCustom is not instance SourceCustom")
        if not isinstance(normalizationCallback, Normalization):
            raise ParamError("normalizationCallback is not instance Normalization")

        openSource = openSourceCallback.openSource(sourceCustom.getCustom())
        try:
            isSave = True
            for itemData in openSourceCallback.read(openSource):
                try:
                    normalizeData = normalizationCallback.normalizeText(itemData)  # normalize the received text
                    if isSave:  # create a new dictionary index
                        sn = self.__detectEncoding.encodeText(openSourceCallback.getName(openSource))  # name
                        sen = self.__detectEncoding.getEncode(sn)
                        ss = openSourceCallback.getSourceSize(openSource)  # size in kB
                        sde = normalizationCallback.getNormalizeTextEncode()
                        sdc = datetime.datetime.now()
                        self.__makeDocIndex(normalizeData, True, sn, sen, ss, sde, sdc)  # save
                        isSave = False
                    else:  # add data to the already existing index
                        self.__makeDocIndex(normalizeData, False)
                except ParamError:
                    # Best effort: a piece rejected with ParamError is skipped.
                    # isSave stays True if this was the first piece, so the
                    # next piece creates the document entry instead.
                    pass
            self.__saveDict(True)  # flush the dictionary if it holds anything
        finally:
            # Release the source on every path, including unexpected errors.
            openSourceCallback.closeSource(openSource)

    def makeTotalStatistic(self):
        """Delegate to ``mongoUtils.mergeDicts()``."""
        self.__mongoUtils.mergeDicts()

    def addMoreStatistics(self, calc):
        """Delegate to ``mongoUtils.addMoreStatistics(calc)``."""
        self.__mongoUtils.addMoreStatistics(calc)

    def getBufferSize(self):
        """Return the buffer size limit."""
        return self.__bufferSize

    def setBufferSize(self, bufferSize):
        """Set the buffer size limit."""
        self.__bufferSize = bufferSize

    def getMainStatisticID(self):
        """Return ``mongoUtils.getMergeDictID()``."""
        return self.__mongoUtils.getMergeDictID()

    def __makeDocIndex(self, data, createNewDocIndex=False, sn=None, sen=None, ss=0, sde=None, sdc=None):
        """
        Build the index for a document.

        :param data: data for the index (iterated word by word)
        :param createNewDocIndex: whether to create a new document index (True - yes)
        :param sn: source name - name of the source (used to create the index)
        :param sen: sourceEncodeName - encoding of the source name
        :param ss: source size - size of all data of the source
        :param sde: source data encode - encoding of the source data
        :param sdc: source data created - creation date of the index for the source
        :raises TypeError: if ``sys.getsizeof`` cannot size the buffer dictionary
        """
        if createNewDocIndex:  # create a new document
            self.__bufferDictID = self.__mongoUtils.saveDict(sn, sen, ss, {}, sde, sdc)

        limitBytes = self.getBufferSize() * 1024
        for itemWord in data:  # add words to the dictionary
            # Read self.__bufferDict afresh each time: __saveDict rebinds it.
            self.__bufferDict[itemWord] = self.__bufferDict.get(itemWord, 0) + 1

            # Checked after every word so the buffer is flushed as soon as it
            # exceeds the limit. getsizeof is shallow (container only).
            sizeDict = sys.getsizeof(self.__bufferDict, -1)
            if sizeDict == -1:
                raise TypeError("Object does not provide means to retrieve the size (see docs)")
            if sizeDict > limitBytes:
                self.__saveDict()

    def __saveDict(self, cleanDictID=False):
        """
        Save the buffered dictionary data.

        Nothing is sent unless both a dictionary id and buffered words exist.

        :param cleanDictID: True - forget the dictionary key afterwards
        """
        if self.__bufferDictID and self.__bufferDict:
            self.__mongoUtils.add2Dict(self.__bufferDictID, self.__bufferDict)
            self.__bufferDict = {}  # start a fresh buffer
        if cleanDictID:  # reset the key
            self.__bufferDictID = None

    bufferSize = property(getBufferSize, setBufferSize)
class MongoStatistic(Statistic):
    """
    Build a word index and save it in MongoDB.

    Word counts are accumulated in an in-memory buffer dictionary. The buffer
    is pushed to ``mongoUtils`` whenever ``sys.getsizeof`` reports it larger
    than ``bufferSize`` kilobytes, and once more when a document is finished.
    """

    def __init__(self, mongoUtils):
        """
        :param mongoUtils: object through which dictionaries are saved and merged
        """
        self.__bufferSize = 2048  # flush threshold; multiplied by 1024 in __makeDocIndex
        self.__mongoUtils = mongoUtils
        self.__bufferDictID = None  # value returned by saveDict for the current document
        self.__bufferDict = {}  # word -> occurrences not yet flushed
        self.__detectEncoding = DetectEncoding()

    def makeDocStatistic(self, sourceName, ss, data, normalizationCallback):
        """
        Index one piece of data as a new document.

        :param sourceName: name of the source
        :param ss: source size
        :param data: data to normalize and index
        :param normalizationCallback: ``Normalization`` instance used to normalize ``data``
        :raises ParamError: if ``normalizationCallback`` is falsy or not a ``Normalization``
        """
        if not normalizationCallback:
            raise ParamError("normalizationCallback not to be a None")
        if not isinstance(normalizationCallback, Normalization):
            raise ParamError(
                "normalizationCallback is not instance Normalization")

        sn = self.__detectEncoding.encodeText(sourceName)
        sen = self.__detectEncoding.getEncode(sourceName)
        sde = normalizationCallback.getNormalizeTextEncode()
        sdc = datetime.datetime.now()

        # normalize the received text
        normalizeData = normalizationCallback.normalizeText(data)
        self.__makeDocIndex(normalizeData, True, sn, sen, ss, sde, sdc)  # save
        self.__saveDict(True)  # flush the dictionary if it holds anything

    def makeDocStatisticCustom(self, openSourceCallback, sourceCustom,
                               normalizationCallback):
        """
        Open a source, read it piece by piece and index it as one document.

        The document entry is created from the first piece that is indexed
        without a ``ParamError``; later pieces are added to the same entry.
        The source is always closed, even when indexing fails.

        :param openSourceCallback: ``Source`` instance used to open, read and close the source
        :param sourceCustom: ``SourceCustom`` instance whose ``getCustom()`` is passed to ``openSource``
        :param normalizationCallback: ``Normalization`` instance used to normalize each piece
        :raises ParamError: if an argument is falsy or of the wrong type
        """
        if not openSourceCallback or not normalizationCallback or not sourceCustom:
            raise ParamError(
                "openSourceCallback or normalizationCallback or sourceCustom not to be a None"
            )
        if not isinstance(openSourceCallback, Source):
            raise ParamError("openSourceCallback is not instance Source")
        if not isinstance(sourceCustom, SourceCustom):
            raise ParamError("sourceCustom is not instance SourceCustom")
        if not isinstance(normalizationCallback, Normalization):
            raise ParamError(
                "normalizationCallback is not instance Normalization")

        openSource = openSourceCallback.openSource(sourceCustom.getCustom())
        try:
            isSave = True
            for itemData in openSourceCallback.read(openSource):
                try:
                    # normalize the received text
                    normalizeData = normalizationCallback.normalizeText(
                        itemData)
                    if isSave:  # create a new dictionary index
                        sn = self.__detectEncoding.encodeText(
                            openSourceCallback.getName(openSource))  # name
                        sen = self.__detectEncoding.getEncode(sn)
                        ss = openSourceCallback.getSourceSize(
                            openSource)  # size in kB
                        sde = normalizationCallback.getNormalizeTextEncode()
                        sdc = datetime.datetime.now()
                        self.__makeDocIndex(normalizeData, True, sn, sen, ss,
                                            sde, sdc)  # save
                        isSave = False
                    else:  # add data to the already existing index
                        self.__makeDocIndex(normalizeData, False)
                except ParamError:
                    # Best effort: a piece rejected with ParamError is skipped.
                    # isSave stays True if this was the first piece, so the
                    # next piece creates the document entry instead.
                    pass
            self.__saveDict(True)  # flush the dictionary if it holds anything
        finally:
            # Release the source on every path, including unexpected errors.
            openSourceCallback.closeSource(openSource)

    def makeTotalStatistic(self):
        """Delegate to ``mongoUtils.mergeDicts()``."""
        self.__mongoUtils.mergeDicts()

    def addMoreStatistics(self, calc):
        """Delegate to ``mongoUtils.addMoreStatistics(calc)``."""
        self.__mongoUtils.addMoreStatistics(calc)

    def getBufferSize(self):
        """Return the buffer size limit."""
        return self.__bufferSize

    def setBufferSize(self, bufferSize):
        """Set the buffer size limit."""
        self.__bufferSize = bufferSize

    def getMainStatisticID(self):
        """Return ``mongoUtils.getMergeDictID()``."""
        return self.__mongoUtils.getMergeDictID()

    def __makeDocIndex(self,
                       data,
                       createNewDocIndex=False,
                       sn=None,
                       sen=None,
                       ss=0,
                       sde=None,
                       sdc=None):
        """
        Build the index for a document.

        :param data: data for the index (iterated word by word)
        :param createNewDocIndex: whether to create a new document index (True - yes)
        :param sn: source name - name of the source (used to create the index)
        :param sen: sourceEncodeName - encoding of the source name
        :param ss: source size - size of all data of the source
        :param sde: source data encode - encoding of the source data
        :param sdc: source data created - creation date of the index for the source
        :raises TypeError: if ``sys.getsizeof`` cannot size the buffer dictionary
        """
        if createNewDocIndex:  # create a new document
            self.__bufferDictID = self.__mongoUtils.saveDict(
                sn, sen, ss, {}, sde, sdc)

        limitBytes = self.getBufferSize() * 1024
        for itemWord in data:  # add words to the dictionary
            # Read self.__bufferDict afresh each time: __saveDict rebinds it.
            self.__bufferDict[itemWord] = self.__bufferDict.get(itemWord, 0) + 1

            # Checked after every word so the buffer is flushed as soon as it
            # exceeds the limit. getsizeof is shallow (container only).
            sizeDict = sys.getsizeof(self.__bufferDict, -1)
            if sizeDict == -1:
                raise TypeError(
                    "Object does not provide means to retrieve the size (see docs)"
                )
            if sizeDict > limitBytes:
                self.__saveDict()

    def __saveDict(self, cleanDictID=False):
        """
        Save the buffered dictionary data.

        Nothing is sent unless both a dictionary id and buffered words exist.

        :param cleanDictID: True - forget the dictionary key afterwards
        """
        if self.__bufferDictID and self.__bufferDict:
            self.__mongoUtils.add2Dict(self.__bufferDictID, self.__bufferDict)
            self.__bufferDict = {}  # start a fresh buffer
        if cleanDictID:  # reset the key
            self.__bufferDictID = None

    bufferSize = property(getBufferSize, setBufferSize)