Python DetectEncoding示例，statistic4text.utils.normalization_utils.DetectEncoding Python示例

示例#1

0

显示文件

文件： datasource_worker_utils.py 项目： romus/iRetrieval

    def createStatistics(self, statisticObject, readerSourceData, normalization, sourceCustomCallback=None):
        """
        Feed every file reported by the reader into the statistic object.

        For each item yielded by ``readerSourceData.getSourceCustom`` the MIME
        type is determined with ``magic``; items whose type matches MIME_TEXT
        or MIME_WORD are passed to ``statisticObject.makeDocStatisticCustom``
        together with the matching entry of ``self._source``. Other items are
        skipped.

        :param statisticObject:  object used to create the statistics
        :param readerSourceData:  object used to obtain the full paths of the files
        :param normalization:  object used to normalize the data
        :param sourceCustomCallback:  callback for the settings object (path verification)
        :raises ParamError:  if statisticObject or readerSourceData is falsy
        :raises TypeError:  if an argument is not an instance of the expected class
        """
        # Mandatory collaborators: presence first, then type.
        if not statisticObject:
            raise ParamError("statisticObject cannot be the None-object")
        elif not isinstance(statisticObject, Statistic):
            raise TypeError("statisticObject can be the list Statistic")

        if not readerSourceData:
            raise ParamError("readerSourceData cannot be the None-object")
        elif not isinstance(readerSourceData, ReaderSourceData):
            raise TypeError("readerSourceData can be the list ReaderSourceData")

        # The callback is optional; its type is only checked when it is given.
        if sourceCustomCallback:
            if not isinstance(sourceCustomCallback, SourceCustomCallback):
                raise TypeError("sourceCustomCallback can be the list SourceCustomCallback")

        encodingDetector = DetectEncoding()
        customHolder = FileSourceCustom()  # one holder object, re-pointed at each file
        for rawPath in readerSourceData.getSourceCustom(sourceCustomCallback):
            # Decode the path with its detected encoding before asking magic for the MIME type.
            pathEncoding = encodingDetector.getEncode(rawPath)
            decodedPath = rawPath.decode(pathEncoding)
            mimeType = magic.from_file(decodedPath, mime=True)
            customHolder.custom = rawPath

            if mimeType == MIME_TEXT:
                sourceKey = 'text'
            elif mimeType == MIME_WORD:
                sourceKey = 'word'
            else:
                continue  # unsupported MIME type

            source = self._source[sourceKey]
            if not source:
                continue
            statisticObject.makeDocStatisticCustom(source, customHolder, normalization)

示例#2

0

显示文件

文件： test_normalization_utils.py 项目： romus/statistic4text

class TestDetectEncoding(unittest.TestCase):
    """Tests for DetectEncoding: encoding detection and re-encoding.

    Fixture files are read from the ``resources`` directory below the current
    working directory.
    """

    def setUp(self):
        """Create a fresh detector and remember the absolute working directory."""
        self.__detectEncoding = DetectEncoding()
        self.__dirPath = os.path.abspath(os.curdir)

    def __readResource(self, name):
        """Return the raw content of the fixture file ``resources/<name>``.

        The file is opened in binary mode so the detector receives the bytes
        exactly as stored; text mode would decode them or translate newlines.
        """
        filePath = os.path.join(self.__dirPath, "resources", name)
        with open(filePath, "rb") as resourceFile:
            return resourceFile.read()

    def testGetEncode(self):
        """getEncode reports the expected encoding name for each fixture."""
        expectations = (
            ("test_encode_utf8", "utf-8"),
            ("test_encode_win1251", "windows-1251"),
            ("test_encode_win866", "IBM866"),
        )
        for resourceName, expectedEncode in expectations:
            detectedEncode = self.__detectEncoding.getEncode(
                self.__readResource(resourceName))
            self.assertEqual(detectedEncode, expectedEncode,
                             "fail detect encode " + resourceName)

    def testGetEncode1TypeError(self):
        """getEncode raises TypeError when given 123 or None."""
        for badValue in (123, None):
            self.assertRaises(TypeError, self.__detectEncoding.getEncode,
                              badValue)

    def testEncode(self):
        """Text re-encoded to utf-8 is afterwards detected as utf-8."""
        win1251Data = self.__readResource("test_encode_win1251")
        utf8Text = self.__detectEncoding.encodeText(win1251Data, "utf-8")
        utf8Encode = self.__detectEncoding.getEncode(utf8Text)
        self.assertEqual(utf8Encode, "utf-8",
                         "fail encode text from test_encode_win1251")

    def testEncodeLookupError(self):
        """encodeText raises LookupError for an unknown target encoding."""
        win1251Data = self.__readResource("test_encode_win1251")
        self.assertRaises(LookupError, self.__detectEncoding.encodeText,
                          win1251Data, "utf-8_test_")

示例#3

0

显示文件

文件： test_normalization_utils.py 项目： romus/statistic4text

class TestDetectEncoding(unittest.TestCase):
    """Tests for DetectEncoding: encoding detection and re-encoding.

    Fixture files are read from the ``resources`` directory below the current
    working directory.
    """

    def setUp(self):
        """Create a fresh detector and remember the absolute working directory."""
        self.__detectEncoding = DetectEncoding()
        self.__dirPath = os.path.abspath(os.curdir)

    def __readResource(self, name):
        """Return the raw content of the fixture file ``resources/<name>``.

        The file is opened in binary mode so the detector receives the bytes
        exactly as stored; text mode would decode them or translate newlines.
        """
        filePath = os.path.join(self.__dirPath, "resources", name)
        with open(filePath, "rb") as resourceFile:
            return resourceFile.read()

    def testGetEncode(self):
        """getEncode reports the expected encoding name for each fixture."""
        expectations = (
            ("test_encode_utf8", "utf-8"),
            ("test_encode_win1251", "windows-1251"),
            ("test_encode_win866", "IBM866"),
        )
        for resourceName, expectedEncode in expectations:
            detectedEncode = self.__detectEncoding.getEncode(self.__readResource(resourceName))
            self.assertEqual(detectedEncode, expectedEncode, "fail detect encode " + resourceName)

    def testGetEncode1TypeError(self):
        """getEncode raises TypeError when given 123 or None."""
        for badValue in (123, None):
            self.assertRaises(TypeError, self.__detectEncoding.getEncode, badValue)

    def testEncode(self):
        """Text re-encoded to utf-8 is afterwards detected as utf-8."""
        win1251Data = self.__readResource("test_encode_win1251")
        utf8Text = self.__detectEncoding.encodeText(win1251Data, "utf-8")
        utf8Encode = self.__detectEncoding.getEncode(utf8Text)
        self.assertEqual(utf8Encode, "utf-8", "fail encode text from test_encode_win1251")

    def testEncodeLookupError(self):
        """encodeText raises LookupError for an unknown target encoding."""
        win1251Data = self.__readResource("test_encode_win1251")
        self.assertRaises(LookupError, self.__detectEncoding.encodeText, win1251Data, "utf-8_test_")

示例#4

0

显示文件

文件： test_normalization_utils.py 项目： romus/statistic4text

 def setUp(self):
     """Prepare the fixture: a new DetectEncoding and the absolute working directory."""
     detector = DetectEncoding()
     self.__detectEncoding = detector
     # Absolute form of the current working directory at set-up time.
     currentDir = os.curdir
     self.__dirPath = os.path.abspath(currentDir)

示例#5

0

显示文件

文件： statistic.py 项目： romus/statistic4text

 def __init__(self, mongoUtils):
     """
     Initialise the instance with an empty buffer and a new DetectEncoding.

     :param mongoUtils:  object stored on the instance for later use
     """
     # Tuple assignment binds left to right, keeping the attribute order.
     self.__bufferSize, self.__mongoUtils = 2048, mongoUtils
     self.__bufferDictID, self.__bufferDict = None, {}
     self.__detectEncoding = DetectEncoding()

示例#6

0

显示文件

文件： statistic.py 项目： romus/statistic4text

class MongoStatistic(Statistic):
    """Build a document index and store it in MongoDB.

    Word counts are accumulated in an in-memory buffer dictionary and handed
    to ``mongoUtils`` when the buffer outgrows ``bufferSize * 1024`` bytes and
    once more when a document is finished.
    """

    def __init__(self, mongoUtils):
        """
        :param mongoUtils:  object used for every storage operation
        """
        self.__bufferSize = 2048  # flush threshold, compared as bufferSize * 1024 bytes
        self.__mongoUtils = mongoUtils
        self.__bufferDictID = None  # ID returned by mongoUtils.saveDict for the current document
        self.__bufferDict = {}  # word -> count, not yet handed to mongoUtils
        self.__detectEncoding = DetectEncoding()

    def makeDocStatistic(self, sourceName, ss, data, normalizationCallback):
        """
        Index one document whose data is passed in directly.

        :param sourceName:  name of the source
        :param ss:  source size
        :param data:  data to normalize and index
        :param normalizationCallback:  Normalization instance applied to ``data``
        :raises ParamError:  if normalizationCallback is falsy or not a Normalization
        """
        if not normalizationCallback:
            raise ParamError("normalizationCallback not to be a None")

        if not isinstance(normalizationCallback, Normalization):
            raise ParamError("normalizationCallback is not instance Normalization")

        sn = self.__detectEncoding.encodeText(sourceName)
        sen = self.__detectEncoding.getEncode(sourceName)
        sde = normalizationCallback.getNormalizeTextEncode()
        sdc = datetime.datetime.now()
        normalizeData = normalizationCallback.normalizeText(data)  # normalize the received text
        self.__makeDocIndex(normalizeData, True, sn, sen, ss, sde, sdc)  # store
        self.__saveDict(True)  # store the dictionary if it holds anything

    def makeDocStatisticCustom(self, openSourceCallback, sourceCustom, normalizationCallback):
        """
        Index one document that is read piece by piece through a Source.

        The first successfully processed piece creates the document index;
        later pieces are added to it. The source is always closed, even when
        reading or indexing fails.

        :param openSourceCallback:  Source instance used to open, read and close the data
        :param sourceCustom:  SourceCustom instance whose ``getCustom()`` is passed to ``openSource``
        :param normalizationCallback:  Normalization instance applied to every piece
        :raises ParamError:  if an argument is falsy or has the wrong type
        """
        if not openSourceCallback or not normalizationCallback or not sourceCustom:
            raise ParamError("openSourceCallback or normalizationCallback or sourceCustom  not to be a None")

        if not isinstance(openSourceCallback, Source):
            raise ParamError("openSourceCallback is not instance Source")

        if not isinstance(sourceCustom, SourceCustom):
            raise ParamError("sourceCustom is not instance SourceCustom")

        if not isinstance(normalizationCallback, Normalization):
            raise ParamError("normalizationCallback is not instance Normalization")

        openSource = openSourceCallback.openSource(sourceCustom.getCustom())
        try:
            isSave = True
            for itemData in openSourceCallback.read(openSource):
                try:
                    normalizeData = normalizationCallback.normalizeText(itemData)  # normalize the received text
                    if isSave:  # create a new dictionary index
                        sn = self.__detectEncoding.encodeText(openSourceCallback.getName(openSource))  # name
                        sen = self.__detectEncoding.getEncode(sn)
                        ss = openSourceCallback.getSourceSize(openSource)  # size in kB
                        sde = normalizationCallback.getNormalizeTextEncode()
                        sdc = datetime.datetime.now()
                        self.__makeDocIndex(normalizeData, True, sn, sen, ss, sde, sdc)  # store
                        isSave = False
                    else:  # add data to the already existing index
                        self.__makeDocIndex(normalizeData, False)
                except ParamError:
                    # Best effort: a piece that cannot be processed is skipped.
                    pass
            self.__saveDict(True)  # store the dictionary if it holds anything
        finally:
            # Release the source on every path, including unexpected errors.
            openSourceCallback.closeSource(openSource)

    def makeTotalStatistic(self):
        """Delegate to ``mongoUtils.mergeDicts()``."""
        self.__mongoUtils.mergeDicts()

    def addMoreStatistics(self, calc):
        """Delegate to ``mongoUtils.addMoreStatistics(calc)``."""
        self.__mongoUtils.addMoreStatistics(calc)

    def getBufferSize(self):
        """Return the buffer threshold (compared as ``bufferSize * 1024`` bytes)."""
        return self.__bufferSize

    def setBufferSize(self, bufferSize):
        """Set the buffer threshold (compared as ``bufferSize * 1024`` bytes)."""
        self.__bufferSize = bufferSize

    def getMainStatisticID(self):
        """Return ``mongoUtils.getMergeDictID()``."""
        return self.__mongoUtils.getMergeDictID()

    def __makeDocIndex(self, data, createNewDocIndex=False, sn=None, sen=None, ss=0, sde=None, sdc=None):
        """
        Create or extend the index of a document.

        :param data:  data for the index
        :param createNewDocIndex:  whether to create a new document index (True - yes)
        :param sn:  source name (used when creating the index)
        :param sen:  source encode name - encoding of the source name
        :param ss:  source size - size of all data of the source
        :param sde:  source data encode - encoding of the source data
        :param sdc:  source data created - creation date of the index for the source
        """
        if createNewDocIndex:  # create a new document
            self.__bufferDictID = self.__mongoUtils.saveDict(sn, sen, ss, {}, sde, sdc)

        for itemWord in data:  # add words to the dictionary
            self.__bufferDict[itemWord] = self.__bufferDict.get(itemWord, 0) + 1

            # getsizeof is shallow: it measures the dict itself, not its keys and values.
            sizeDict = sys.getsizeof(self.__bufferDict, -1)
            if sizeDict == -1:
                raise TypeError("Object does not provide means to retrieve the size (see docs)")
            if sizeDict > self.getBufferSize() * 1024:
                self.__saveDict()

    def __saveDict(self, cleanDictID=False):
        """
        Store the buffered dictionary data.

        :param cleanDictID:  True - forget the dictionary ID afterwards
        """
        if self.__bufferDictID and self.__bufferDict:
            self.__mongoUtils.add2Dict(self.__bufferDictID, self.__bufferDict)
            self.__bufferDict = {}  # start a fresh buffer
        # Reset the ID whenever asked, even if there was nothing to flush;
        # otherwise a stale ID outlives the finished document.
        if cleanDictID:
            self.__bufferDictID = None

    bufferSize = property(getBufferSize, setBufferSize)

示例#7

0

显示文件

文件： test_normalization_utils.py 项目： romus/statistic4text

 def setUp(self):
     """Prepare the fixture: a new DetectEncoding and the absolute working directory."""
     detector = DetectEncoding()
     self.__detectEncoding = detector
     # Absolute form of the current working directory at set-up time.
     currentDir = os.curdir
     self.__dirPath = os.path.abspath(currentDir)

示例#8

0

显示文件

文件： statistic.py 项目： romus/statistic4text

 def __init__(self, mongoUtils):
     """
     Initialise the instance with an empty buffer and a new DetectEncoding.

     :param mongoUtils:  object stored on the instance for later use
     """
     # Tuple assignment binds left to right, keeping the attribute order.
     self.__bufferSize, self.__mongoUtils = 2048, mongoUtils
     self.__bufferDictID, self.__bufferDict = None, {}
     self.__detectEncoding = DetectEncoding()

示例#9

0

显示文件

文件： statistic.py 项目： romus/statistic4text

class MongoStatistic(Statistic):
    """Build a document index and store it in MongoDB.

    Word counts are accumulated in an in-memory buffer dictionary and handed
    to ``mongoUtils`` when the buffer outgrows ``bufferSize * 1024`` bytes and
    once more when a document is finished.
    """

    def __init__(self, mongoUtils):
        """
        :param mongoUtils:  object used for every storage operation
        """
        self.__bufferSize = 2048  # flush threshold, compared as bufferSize * 1024 bytes
        self.__mongoUtils = mongoUtils
        self.__bufferDictID = None  # ID returned by mongoUtils.saveDict for the current document
        self.__bufferDict = {}  # word -> count, not yet handed to mongoUtils
        self.__detectEncoding = DetectEncoding()

    def makeDocStatistic(self, sourceName, ss, data, normalizationCallback):
        """
        Index one document whose data is passed in directly.

        :param sourceName:  name of the source
        :param ss:  source size
        :param data:  data to normalize and index
        :param normalizationCallback:  Normalization instance applied to ``data``
        :raises ParamError:  if normalizationCallback is falsy or not a Normalization
        """
        if not normalizationCallback:
            raise ParamError("normalizationCallback not to be a None")

        if not isinstance(normalizationCallback, Normalization):
            raise ParamError(
                "normalizationCallback is not instance Normalization")

        sn = self.__detectEncoding.encodeText(sourceName)
        sen = self.__detectEncoding.getEncode(sourceName)
        sde = normalizationCallback.getNormalizeTextEncode()
        sdc = datetime.datetime.now()
        # Normalize the received text, then store it as a new document index.
        normalizeData = normalizationCallback.normalizeText(data)
        self.__makeDocIndex(normalizeData, True, sn, sen, ss, sde, sdc)
        self.__saveDict(True)  # store the dictionary if it holds anything

    def makeDocStatisticCustom(self, openSourceCallback, sourceCustom,
                               normalizationCallback):
        """
        Index one document that is read piece by piece through a Source.

        The first successfully processed piece creates the document index;
        later pieces are added to it. The source is always closed, even when
        reading or indexing fails.

        :param openSourceCallback:  Source instance used to open, read and close the data
        :param sourceCustom:  SourceCustom instance whose ``getCustom()`` is passed to ``openSource``
        :param normalizationCallback:  Normalization instance applied to every piece
        :raises ParamError:  if an argument is falsy or has the wrong type
        """
        if not openSourceCallback or not normalizationCallback or not sourceCustom:
            raise ParamError(
                "openSourceCallback or normalizationCallback or sourceCustom  not to be a None"
            )

        if not isinstance(openSourceCallback, Source):
            raise ParamError("openSourceCallback is not instance Source")

        if not isinstance(sourceCustom, SourceCustom):
            raise ParamError("sourceCustom is not instance SourceCustom")

        if not isinstance(normalizationCallback, Normalization):
            raise ParamError(
                "normalizationCallback is not instance Normalization")

        openSource = openSourceCallback.openSource(sourceCustom.getCustom())
        try:
            isSave = True
            for itemData in openSourceCallback.read(openSource):
                try:
                    # Normalize the received text.
                    normalizeData = normalizationCallback.normalizeText(
                        itemData)
                    if isSave:  # create a new dictionary index
                        sn = self.__detectEncoding.encodeText(
                            openSourceCallback.getName(openSource))  # name
                        sen = self.__detectEncoding.getEncode(sn)
                        ss = openSourceCallback.getSourceSize(
                            openSource)  # size in kB
                        sde = normalizationCallback.getNormalizeTextEncode()
                        sdc = datetime.datetime.now()
                        self.__makeDocIndex(normalizeData, True, sn, sen, ss,
                                            sde, sdc)  # store
                        isSave = False
                    else:  # add data to the already existing index
                        self.__makeDocIndex(normalizeData, False)
                except ParamError:
                    # Best effort: a piece that cannot be processed is skipped.
                    pass
            self.__saveDict(True)  # store the dictionary if it holds anything
        finally:
            # Release the source on every path, including unexpected errors.
            openSourceCallback.closeSource(openSource)

    def makeTotalStatistic(self):
        """Delegate to ``mongoUtils.mergeDicts()``."""
        self.__mongoUtils.mergeDicts()

    def addMoreStatistics(self, calc):
        """Delegate to ``mongoUtils.addMoreStatistics(calc)``."""
        self.__mongoUtils.addMoreStatistics(calc)

    def getBufferSize(self):
        """Return the buffer threshold (compared as ``bufferSize * 1024`` bytes)."""
        return self.__bufferSize

    def setBufferSize(self, bufferSize):
        """Set the buffer threshold (compared as ``bufferSize * 1024`` bytes)."""
        self.__bufferSize = bufferSize

    def getMainStatisticID(self):
        """Return ``mongoUtils.getMergeDictID()``."""
        return self.__mongoUtils.getMergeDictID()

    def __makeDocIndex(self,
                       data,
                       createNewDocIndex=False,
                       sn=None,
                       sen=None,
                       ss=0,
                       sde=None,
                       sdc=None):
        """
        Create or extend the index of a document.

        :param data:  data for the index
        :param createNewDocIndex:  whether to create a new document index (True - yes)
        :param sn:  source name (used when creating the index)
        :param sen:  source encode name - encoding of the source name
        :param ss:  source size - size of all data of the source
        :param sde:  source data encode - encoding of the source data
        :param sdc:  source data created - creation date of the index for the source
        """
        if createNewDocIndex:  # create a new document
            self.__bufferDictID = self.__mongoUtils.saveDict(
                sn, sen, ss, {}, sde, sdc)

        for itemWord in data:  # add words to the dictionary
            self.__bufferDict[itemWord] = self.__bufferDict.get(itemWord,
                                                                0) + 1

            # getsizeof is shallow: it measures the dict itself, not its keys and values.
            sizeDict = sys.getsizeof(self.__bufferDict, -1)
            if sizeDict == -1:
                raise TypeError(
                    "Object does not provide means to retrieve the size (see docs)"
                )
            if sizeDict > self.getBufferSize() * 1024:
                self.__saveDict()

    def __saveDict(self, cleanDictID=False):
        """
        Store the buffered dictionary data.

        :param cleanDictID:  True - forget the dictionary ID afterwards
        """
        if self.__bufferDictID and self.__bufferDict:
            self.__mongoUtils.add2Dict(self.__bufferDictID, self.__bufferDict)
            self.__bufferDict = {}  # start a fresh buffer
        # Reset the ID whenever asked, even if there was nothing to flush;
        # otherwise a stale ID outlives the finished document.
        if cleanDictID:
            self.__bufferDictID = None

    bufferSize = property(getBufferSize, setBufferSize)