Пример #1
0
    def word_count(self, value: Dict[int, int]):
        """
        Validate and store the word-count mapping.

        Args:
            value: dict whose keys and values are all plain ``int``.
                   NOTE(review): presumably maps a per-document word count
                   to the number of documents with that count - confirm.

        Raises:
            InvalidTypeError: if `value` is not a dict, or if any key or
                              count is not exactly of type int
        """
        # Exact type comparison rather than isinstance: dict subclasses and
        # bool keys/counts are rejected.
        if type(value) is not dict:
            raise InvalidTypeError('word_count', type(value), '<dict>')

        for key, count in value.items():
            if type(key) is not int:
                raise InvalidTypeError('word_count: key', type(key), '<int>')
            if type(count) is not int:
                raise InvalidTypeError('word_count: count', type(count),
                                       '<int>')
        self._word_count = value
Пример #2
0
    def tfidf(self, value: Dict[str, float]):
        """
        Validate and store the TF-IDF scores.

        Args:
            value: dict mapping a ``str`` key to a score that is exactly a
                   ``float`` or an ``int``

        Raises:
            InvalidTypeError: if `value` is not a dict, a key is not a str,
                              or a score is neither float nor int
        """
        # Exact type comparison rather than isinstance: subclasses (including
        # bool scores) are rejected.
        if type(value) is not dict:
            raise InvalidTypeError('tfidf', type(value), '<dict>')

        for key, score in value.items():
            if type(key) is not str:
                raise InvalidTypeError('tfidf: key', type(key), '<str>')
            if type(score) not in (float, int):
                raise InvalidTypeError('tfidf: score', type(score),
                                       '<float> or <int>')
        self._tfidf = value
Пример #3
0
    def document_frequency(self, value: Dict[str, int]):
        """
        Validate and store the document-frequency mapping.

        Args:
            value: dict with ``str`` keys and ``int`` counts

        Raises:
            InvalidTypeError: if `value` is not a dict, a key is not a str,
                              or a count is not an int (exact types only)
        """
        container_type = type(value)
        if container_type is not dict:
            raise InvalidTypeError('document_frequency', container_type, '<dict>')

        for term, occurrences in value.items():
            term_type = type(term)
            if term_type is not str:
                raise InvalidTypeError('document_frequency: key', term_type,
                                       '<str>')
            occurrences_type = type(occurrences)
            if occurrences_type is not int:
                raise InvalidTypeError('document_frequency: count',
                                       occurrences_type, '<int>')

        self._document_frequency = value
Пример #4
0
 def frequency_count(self, frequency: Dict[str, int]):
     """
     Validate and store the frequency mapping and refresh the total count.

     Args:
         frequency: dict whose keys are ``str`` or ``int`` and whose values
                    are ``int`` (exact types only)

     Raises:
         InvalidTypeError: if `frequency` is not a dict, or a key/value has
                           an unsupported type. Nothing is stored in that
                           case.
     """
     if type(frequency) is not dict:
         raise InvalidTypeError('frequency_count', type(frequency),
                                '<dict>')
     # Validate the incoming mapping, not the one currently stored on self;
     # otherwise invalid input would be accepted unchecked.
     for key, value in frequency.items():
         if type(key) not in (str, int):
             raise InvalidTypeError('frequency_count:key', type(key),
                                    '<str> or <int>')
         if type(value) is not int:
             raise InvalidTypeError('frequency_count:value', type(value),
                                    '<int>')
     self._frequency_count = frequency
     self._total_count = sum(frequency.values())
Пример #5
0
    def __init__(self, preprocess_fn: Optional[Callable[[str], str]] = None,
                 predefined_pattern: Optional[Dict[str, str]] = None,
                 tokenizer: Optional[Callable[[str], List[str]]] = word_tokenize,
                 stop_words: Optional[Set] = None,
                 stop_words_by_languages: Optional[List[str]] = None,
                 tf_type: Optional = TermFrequencyType.TF_ABSOLUTE):
        """
        Initialize TextDataAnalyzer

        Args:
            preprocess_fn: the function that pre-processes the text, returns the processed text
            predefined_pattern: the dictionary that maps a pattern name to its regex string;
                                None (the default) means no patterns are counted
            tokenizer: the function that tokenizes a text into a list of tokens; the default
                       (also used when None is passed) is `word_tokenize`
            stop_words: the set of stop words, stored as a frozenset in `self.stop_words`
            stop_words_by_languages: a list of language names passed to `stopwords.words` to build
                                     the stop words set. Ignored if `stop_words` is not None.
            tf_type: how the term frequency is calculated, default is TF_ABSOLUTE. Must be one of
                    - TF_ABSOLUTE: the occurrence of t in document d, i.e. f(t,d)
                    - TF_BOOLEAN: 1 if term appeared in the document, otherwise 0
                    - TF_NORMALIZED_BY_MAX: f(t,d) normalized by the maximum term frequency
                    - TF_NORMALIZED_BY_DOC: f(t,d) normalized by the total number of terms in the document
                    - TF_LOGARITHM: log (1+ f(t,d))
                    - TF_AUGMENTED: 0.5 + 0.5 * (f(t,d) / max(f(t',d)) for t' in d)

        Raises:
            InvalidTypeError: if `tf_type` is not one of the supported TermFrequencyType values
        """
        super(TextDataAnalyzer, self).__init__()

        # Both parameters are declared Optional: fall back to "no patterns"
        # and to the default tokenizer instead of failing later inside
        # `processing` when None is passed.
        if predefined_pattern is None:
            predefined_pattern = {}
        if tokenizer is None:
            tokenizer = word_tokenize

        def processing(text):
            """Pre-process `text`; return (tokens, per-pattern match counts, processed text)."""
            if preprocess_fn is not None:
                text = preprocess_fn(text)
            pattern_count = {
                pattern_name: len(re.findall(pattern_regex, text, re.MULTILINE))
                for pattern_name, pattern_regex in predefined_pattern.items()
            }
            tokens = tokenizer(text)
            return tokens, pattern_count, text

        self.preprocessor = processing

        if stop_words is None:
            # Build the stop word set from the requested languages, if any.
            stop_words = set()
            if stop_words_by_languages is not None:
                for lang in stop_words_by_languages:
                    stop_words.update(stopwords.words(lang))
        self.stop_words = frozenset(stop_words)

        self._total_count = 0
        self._pattern_occurrence_counter = defaultdict(int)
        self._pattern_document_counter = defaultdict(int)
        self._word_counter = defaultdict(int)
        self._character_counter = defaultdict(int)
        self._absolute_term_frequency = Counter()
        self._term_frequency = Counter()
        self._document_frequency = Counter()
        if tf_type not in [TermFrequencyType.TF_ABSOLUTE, TermFrequencyType.TF_BOOLEAN,
                           TermFrequencyType.TF_NORMALIZED_BY_DOC, TermFrequencyType.TF_NORMALIZED_BY_MAX,
                           TermFrequencyType.TF_LOGARITHM, TermFrequencyType.TF_AUGMENTED]:
            # Attribute name first, offending value second.
            # NOTE(review): assumes InvalidTypeError takes
            # (attribute name, given, expected) - confirm.
            raise InvalidTypeError('tf_type', tf_type, '<one of the TermFrequencyType>')
        self.tf_type = tf_type
Пример #6
0
    def kde(self, value: List[Tuple[Union[float, int, None], Union[float, int,
                                                                   None]]]):
        """
        Validate and store the KDE points.

        Args:
            value: list of 2-item tuples ``(x, y)`` where both coordinates
                   are int or float. Although the annotation mentions None,
                   the checks below reject None coordinates.

        Raises:
            InvalidTypeError: if `value` is not a list, a point is not a
                              tuple, or a coordinate is not int/float
            InvalidSizeError: if a point does not have exactly 2 items
        """
        if not isinstance(value, list):
            raise InvalidTypeError('kde', type(value), '<list>')

        for item in value:
            if type(item) != tuple:
                raise InvalidTypeError('kde: point', type(item), '<tuple>')
            if len(item) != 2:
                raise InvalidSizeError('kde: point', len(item), 2)
            if not isinstance(item[0], (float, int)):
                raise InvalidTypeError('kde: point: x', type(item[0]),
                                       '<int> or <float>')
            # The y check must look at item[1] for both float and int.
            if not isinstance(item[1], (float, int)):
                raise InvalidTypeError('kde: point: y', type(item[1]),
                                       '<int> or <float>')

        self._kde = value
Пример #7
0
        def __check_dict(dict_item):
            """
            Recursively validate a (possibly nested) frequency dictionary.

            Every int value encountered is added to ``self._total_count``.
            On any validation failure ``self._total_count`` is reset to 0
            before the error is raised.

            Args:
                dict_item: dict with str/int keys whose values are either
                           int or nested dicts of the same form

            Returns:
                the nesting height of `dict_item`: 1 for a dict holding only
                int values (or an empty dict), one more per nested level

            Raises:
                InvalidTypeError: if `dict_item` is not a dict, or a key or
                                  value has an unsupported type
            """
            if type(dict_item) != dict:
                self._total_count = 0
                raise InvalidTypeError('frequency_count', type(dict_item), '<dict>')
            height_list = []
            for key, value in dict_item.items():
                if type(key) not in [str, int]:
                    self._total_count = 0
                    raise InvalidTypeError('frequency_count:key', type(key), '<str> or <int>')
                if type(value) not in [dict, int]:
                    self._total_count = 0
                    raise InvalidTypeError('frequency_count:value', type(value), '<int> or <dict>')

                # After the check above the value is exactly a dict or an int.
                if type(value) == dict:
                    height_list.append(__check_dict(value))
                else:
                    self._total_count += value
                    height_list.append(0)
            # default=0 keeps an empty dict from raising ValueError in max().
            return max(height_list, default=0) + 1
Пример #8
0
    def pattern_stats(self, value: Dict[str, tuple]):
        """
        Validate and store the pattern statistics.

        Args:
            value: dict mapping a ``str`` pattern name to a 2-item tuple
                   ``(term_count, document_count)`` of ints

        Raises:
            InvalidTypeError: if `value` is not a dict, a name is not a str,
                              an entry is not a tuple, or either element of
                              an entry is not an int (exact types only)
            InvalidSizeError: if an entry does not have exactly 2 items
        """
        if type(value) is not dict:
            raise InvalidTypeError('pattern_stats', type(value), '<dict>')

        for name, count in value.items():
            if type(name) is not str:
                raise InvalidTypeError('pattern_stats: key', type(name),
                                       '<str>')
            if type(count) is not tuple:
                raise InvalidTypeError('pattern_stats: count', type(count),
                                       '<tuple>')
            if len(count) != 2:
                raise InvalidSizeError('pattern_stats: count', len(count), 2)
            term_count, document_count = count
            if type(term_count) is not int:
                raise InvalidTypeError('pattern_stats: count: term_count',
                                       type(term_count), '<int>')
            if type(document_count) is not int:
                raise InvalidTypeError('pattern_stats: count: document_count',
                                       type(document_count), '<int>')

        self._pattern_stats = value
Пример #9
0
    def histogram(self, value: List[Tuple[Union[float, int, None],
                                          Union[float, int, None], int]]):
        """
        Validate and store the histogram, then refresh the total count.

        Args:
            value: list of 3-item tuples ``(left_edge, right_edge, count)``
                   where the edges are int or float and the count is int.
                   Although the annotation mentions None, the checks below
                   reject None edges.

        Raises:
            InvalidTypeError: if `value` is not a list, a bin is not a
                              tuple, or a bin element has the wrong type
            InvalidSizeError: if a bin does not have exactly 3 items
        """
        if not isinstance(value, list):
            raise InvalidTypeError('histogram', type(value), '<list>')

        for item in value:
            # Check the bin itself (not the enclosing list) is a tuple.
            if not isinstance(item, tuple):
                raise InvalidTypeError('histogram: bin', type(item), '<tuple>')
            if len(item) != 3:
                raise InvalidSizeError('histogram: bin', len(item), 3)
            if not isinstance(item[0], (float, int)):
                raise InvalidTypeError('histogram: bin: bin_edge_left',
                                       type(item[0]), '<int> or <float>')
            if not isinstance(item[1], (float, int)):
                raise InvalidTypeError('histogram: bin: bin_edge_right',
                                       type(item[1]), '<int> or <float>')
            if not isinstance(item[2], int):
                raise InvalidTypeError('histogram: bin: bin_edge_count',
                                       type(item[2]), '<int>')

        self._histogram = value
        # Assigned through `total_count` (not `_total_count`), as before.
        self.total_count = sum(item[2] for item in value)
Пример #10
0
    def feed(self, value, label):
        """
        Pass every element of a list to the wrapped analyzer, one at a time.

        Args:
            value: list of items; each item is forwarded to
                   `self.analyzer.feed`
            label: label forwarded together with every item

        Raises:
            InvalidTypeError: if `value` is not exactly a list
        """
        if type(value) is list:
            for element in value:
                self.analyzer.feed(value=element, label=label)
            return

        raise InvalidTypeError('value', type(value), '<list>')
Пример #11
0
    def __init__(self,
                 data_type_list: List,
                 column_names: List = None,
                 sequence_names: List = None):
        """
        Initialize data analyzer suite

        Args:
            data_type_list: list, a list of pre-defined data types.
                            If column_names is not provided, the positional
                            indices 0..len(data_type_list)-1 are used as keys.
            column_names: list, a list of column names; must have the same
                          length as data_type_list when given.
            sequence_names: list, a list of column keys whose analyzer is
                            wrapped in a SequenceAnalyzer.

        Raises:
            InvalidTypeError: if column_names is given but is not a list
            InconsistentSize: if column_names and data_type_list differ in length
            AnalyzerDataTypeNotSupported: if a data type has no matching analyzer
        """
        if column_names is None:
            column_names = list(range(len(data_type_list)))
        elif type(column_names) != list:
            # Report the argument that actually failed the check.
            raise InvalidTypeError('column_names', type(column_names),
                                   '<list>')
        elif len(column_names) != len(data_type_list):
            # Lengths are passed in the same order as the names.
            # NOTE(review): assumes InconsistentSize takes
            # (name_a, name_b, length_a, length_b) - confirm.
            raise InconsistentSize('data_type_list', 'column_names',
                                   len(data_type_list),
                                   len(column_names))

        self.schema = dict(zip(column_names, data_type_list))

        self.analyzers = dict()

        if sequence_names is None:
            sequence_names = []

        for key, data_type in self.schema.items():
            if data_type == DATATYPE.CATEGORY:
                analyzer = LabelledCategoricalDataAnalyzer()
            elif data_type == DATATYPE.NUMBER:
                analyzer = LabelledNumericalDataAnalyzer()
            elif data_type == DATATYPE.FREETEXT:
                analyzer = LabelledTextDataAnalyzer()
            elif data_type == DATATYPE.DATETIME:
                analyzer = LabelledDatetimeDataAnalyzer()
            else:
                raise AnalyzerDataTypeNotSupported(data_type)

            if key in sequence_names:
                self.analyzers[key] = SequenceAnalyzer(analyzer=analyzer)
            else:
                self.analyzers[key] = analyzer
Пример #12
0
 def total_count(self, value: int):
     """
     Store the total count after checking that it is an int.

     Args:
         value: the total count; any ``int`` instance is accepted

     Raises:
         InvalidTypeError: if `value` is not an instance of int
     """
     if isinstance(value, int):
         self._total_count = value
         return
     raise InvalidTypeError('total_count', type(value), '<int>')
Пример #13
0
 def median(self, value: Union[float, int, None]):
     """
     Store the median after checking its type.

     Args:
         value: the median as a float or int, or None

     Raises:
         InvalidTypeError: if `value` is neither None nor a float/int
                           instance
     """
     acceptable = value is None or isinstance(value, (float, int))
     if not acceptable:
         raise InvalidTypeError('median', type(value),
                                '<int> or <float> or None')
     self._median = value
Пример #14
0
 def resolution_list(self, resolution_list: List[int]):
     """
     Validate and store the resolution list.

     Args:
         resolution_list: list whose items are all plain ``int``

     Raises:
         InvalidTypeError: if `resolution_list` is not a list, or an item
                           is not exactly of type int
     """
     if type(resolution_list) is not list:
         raise InvalidTypeError('resolution_list', type(resolution_list), '<list>')
     for value in resolution_list:
         if type(value) is not int:
             raise InvalidTypeError('resolution_list: item', type(value), '<int>')
     # Keep the validated value; previously it was checked and then dropped.
     # NOTE(review): the attribute name assumes the `_<property>` naming
     # used for backing fields - confirm against the getter.
     self._resolution_list = resolution_list
Пример #15
0
 def total_count(self, value: int):
     """
     Store the total count after checking that it is exactly an int.

     Args:
         value: the total count; subclasses of int (e.g. bool) are rejected

     Raises:
         InvalidTypeError: if the type of `value` is not int
     """
     if type(value) is int:
         self._total_count = value
     else:
         raise InvalidTypeError('total_count', type(value), '<int>')