# Imports used across the fragments below.
import re
from collections import Counter, defaultdict
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def word_count(self, value: Dict[int, int]):
    if type(value) != dict:
        raise InvalidTypeError('word_count', type(value), '<dict>')
    for key, count in value.items():
        if type(key) != int:
            raise InvalidTypeError('word_count: key', type(key), '<int>')
        if type(count) != int:
            raise InvalidTypeError('word_count: count', type(count), '<int>')
    self._word_count = value
def tfidf(self, value: Dict[str, Union[float, int]]):
    if type(value) != dict:
        raise InvalidTypeError('tfidf', type(value), '<dict>')
    for key, score in value.items():
        if type(key) != str:
            raise InvalidTypeError('tfidf: key', type(key), '<str>')
        if type(score) not in [float, int]:
            raise InvalidTypeError('tfidf: score', type(score), '<float> or <int>')
    self._tfidf = value
def document_frequency(self, value: Dict[str, int]):
    if type(value) != dict:
        raise InvalidTypeError('document_frequency', type(value), '<dict>')
    for key, count in value.items():
        if type(key) != str:
            raise InvalidTypeError('document_frequency: key', type(key), '<str>')
        if type(count) != int:
            raise InvalidTypeError('document_frequency: count', type(count), '<int>')
    self._document_frequency = value
def frequency_count(self, frequency: Dict[str, int]):
    if type(frequency) != dict:
        raise InvalidTypeError('frequency_count', type(frequency), '<dict>')
    # Validate the incoming mapping, not the previously stored one.
    for key, value in frequency.items():
        if type(key) not in [str, int]:
            raise InvalidTypeError('frequency_count: key', type(key), '<str> or <int>')
        if type(value) != int:
            raise InvalidTypeError('frequency_count: value', type(value), '<int>')
    self._frequency_count = frequency
    self._total_count = sum(self._frequency_count.values())
def __init__(self,
             preprocess_fn: Optional[Callable[[str], str]] = None,
             predefined_pattern: Optional[Dict[str, str]] = None,
             tokenizer: Optional[Callable[[str], List[str]]] = word_tokenize,
             stop_words: Optional[Set] = None,
             stop_words_by_languages: Optional[List[str]] = None,
             tf_type: TermFrequencyType = TermFrequencyType.TF_ABSOLUTE):
    """
    Initialize TextDataAnalyzer

    Args:
        preprocess_fn: the function that pre-processes the text and returns the processed text
        predefined_pattern: a dictionary that maps a pattern name to its regex string
        tokenizer: the function that tokenizes a text into a list of tokens; the default
            tokenizer is `word_tokenize` from nltk.tokenize
        stop_words: the set of stop words that will be ignored in the final stats
        stop_words_by_languages: a list of language codes (from nltk.corpus) used to build
            the stop-word set by language, as supported by `nltk.corpus.stopwords`.
            Ignored if `stop_words` is not None.
        tf_type: how the term frequency is calculated; the default is TF_ABSOLUTE
            - TF_ABSOLUTE: the occurrence of t in document d, i.e. f(t,d)
            - TF_BOOLEAN: 1 if the term appears in the document, otherwise 0
            - TF_NORMALIZED_BY_MAX: f(t,d) normalized by the maximum term frequency
            - TF_NORMALIZED_BY_DOC: f(t,d) normalized by the total number of terms in the document
            - TF_LOGARITHM: log(1 + f(t,d))
            - TF_AUGMENTED: 0.5 + 0.5 * (f(t,d) / max(f(t',d) for t' in d))
    """
    super(TextDataAnalyzer, self).__init__()

    # Avoid a mutable default argument for the pattern dictionary.
    if predefined_pattern is None:
        predefined_pattern = {}

    def processing(text):
        if preprocess_fn is not None:
            text = preprocess_fn(text)
        pattern_count = dict()
        for pattern_name, pattern_regex in predefined_pattern.items():
            pattern_count[pattern_name] = len(re.findall(pattern_regex, text, re.MULTILINE))
        tokens = tokenizer(text)
        return tokens, pattern_count, text

    self.preprocessor = processing

    if stop_words is None:
        stop_words = set()
        if stop_words_by_languages is not None:
            for lang in stop_words_by_languages:
                stop_words.update(stopwords.words(lang))
    self.stop_words = frozenset(stop_words)

    self._total_count = 0
    self._pattern_occurrence_counter = defaultdict(int)
    self._pattern_document_counter = defaultdict(int)
    self._word_counter = defaultdict(int)
    self._character_counter = defaultdict(int)
    self._absolute_term_frequency = Counter()
    self._term_frequency = Counter()
    self._document_frequency = Counter()

    if tf_type not in [TermFrequencyType.TF_ABSOLUTE,
                       TermFrequencyType.TF_BOOLEAN,
                       TermFrequencyType.TF_NORMALIZED_BY_DOC,
                       TermFrequencyType.TF_NORMALIZED_BY_MAX,
                       TermFrequencyType.TF_LOGARITHM,
                       TermFrequencyType.TF_AUGMENTED]:
        raise InvalidTypeError('tf_type', type(tf_type), '<TermFrequencyType>')
    self.tf_type = tf_type
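# Illustrative usage sketch (not part of the class): constructing the analyzer
# with the parameters documented above. The regex and language code are arbitrary
# sample values; everything else is defined in this module.
analyzer = TextDataAnalyzer(
    preprocess_fn=lambda text: text.lower(),
    predefined_pattern={'email': r'[\w.+-]+@[\w-]+\.[\w.]+'},
    stop_words_by_languages=['english'],
    tf_type=TermFrequencyType.TF_LOGARITHM,
)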
def kde(self, value: List[Tuple[Union[float, int, None], Union[float, int, None]]]):
    if not isinstance(value, list):
        raise InvalidTypeError('kde', type(value), '<list>')
    for item in value:
        if type(item) != tuple:
            raise InvalidTypeError('kde: point', type(item), '<tuple>')
        if len(item) != 2:
            raise InvalidSizeError('kde: point', len(item), 2)
        if not isinstance(item[0], (float, int)):
            raise InvalidTypeError('kde: point: x', type(item[0]), '<int> or <float>')
        if not isinstance(item[1], (float, int)):
            raise InvalidTypeError('kde: point: y', type(item[1]), '<int> or <float>')
    self._kde = value
def __check_dict(dict_item):
    if type(dict_item) != dict:
        self._total_count = 0
        raise InvalidTypeError('frequency_count', type(dict_item), '<dict>')
    height_list = []
    for key, value in dict_item.items():
        if type(key) not in [str, int]:
            self._total_count = 0
            raise InvalidTypeError('frequency_count: key', type(key), '<str> or <int>')
        if type(value) not in [dict, int]:
            self._total_count = 0
            raise InvalidTypeError('frequency_count: value', type(value), '<int> or <dict>')
        if type(value) == dict:
            height = __check_dict(value)
            height_list.append(height)
        if type(value) == int:
            self._total_count += value
            height_list.append(0)
    # `default=0` keeps an empty dict from raising ValueError in max().
    return max(height_list, default=0) + 1
def pattern_stats(self, value: Dict[str, Tuple[int, int]]):
    if type(value) != dict:
        raise InvalidTypeError('pattern_stats', type(value), '<dict>')
    for name, count in value.items():
        if type(name) != str:
            raise InvalidTypeError('pattern_stats: key', type(name), '<str>')
        if type(count) != tuple:
            raise InvalidTypeError('pattern_stats: count', type(count), '<tuple>')
        if len(count) != 2:
            raise InvalidSizeError('pattern_stats: count', len(count), 2)
        if type(count[0]) != int:
            raise InvalidTypeError('pattern_stats: count: term_count', type(count[0]), '<int>')
        if type(count[1]) != int:
            raise InvalidTypeError('pattern_stats: count: document_count', type(count[1]), '<int>')
    self._pattern_stats = value
def histogram(self, value: List[Tuple[Union[float, int, None], Union[float, int, None], int]]):
    if not isinstance(value, list):
        raise InvalidTypeError('histogram', type(value), '<list>')
    for item in value:
        # Each bin is a (left_edge, right_edge, count) tuple.
        if not isinstance(item, tuple):
            raise InvalidTypeError('histogram: bin', type(item), '<tuple>')
        if len(item) != 3:
            raise InvalidSizeError('histogram: bin', len(item), 3)
        if not isinstance(item[0], (float, int)):
            raise InvalidTypeError('histogram: bin: bin_edge_left', type(item[0]), '<int> or <float>')
        if not isinstance(item[1], (float, int)):
            raise InvalidTypeError('histogram: bin: bin_edge_right', type(item[1]), '<int> or <float>')
        if not isinstance(item[2], int):
            raise InvalidTypeError('histogram: bin: bin_count', type(item[2]), '<int>')
    self._histogram = value
    self.total_count = sum(item[2] for item in self._histogram)
def feed(self, value, label):
    """
    Feed sequence values into the analyzer and aggregate the stats

    Args:
        value: a list of items
        label: the label associated with the value
    """
    if type(value) != list:
        raise InvalidTypeError('value', type(value), '<list>')
    for item in value:
        self.analyzer.feed(value=item, label=label)
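# Illustrative usage sketch: feeding one labelled sequence per call. The
# SequenceAnalyzer wrapper and LabelledNumericalDataAnalyzer are the names used
# by the suite initializer below.
seq_analyzer = SequenceAnalyzer(analyzer=LabelledNumericalDataAnalyzer())
seq_analyzer.feed(value=[1.5, 2.0, 3.25], label='class_a')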
def __init__(self, data_type_list: List, column_names: List = None, sequence_names: List = None):
    """
    Initialize data analyzer suite

    Args:
        data_type_list: list, a list of pre-defined data types. If column_names is not
            provided, data_type_list is assumed to cover all the columns in order.
        column_names: list, a list of column names.
        sequence_names: list, a list of feature names that are considered sequence data.
    """
    if column_names is not None:
        if type(column_names) == list:
            if len(column_names) != len(data_type_list):
                raise InconsistentSize('data_type_list', 'column_names',
                                       len(data_type_list), len(column_names))
        else:
            raise InvalidTypeError('column_names', type(column_names), '<list>')
    else:
        column_names = list(range(len(data_type_list)))

    self.schema = dict(zip(column_names, data_type_list))
    self.analyzers = dict()
    if sequence_names is None:
        sequence_names = []
    for key, data_type in self.schema.items():
        if data_type == DATATYPE.CATEGORY:
            analyzer = LabelledCategoricalDataAnalyzer()
        elif data_type == DATATYPE.NUMBER:
            analyzer = LabelledNumericalDataAnalyzer()
        elif data_type == DATATYPE.FREETEXT:
            analyzer = LabelledTextDataAnalyzer()
        elif data_type == DATATYPE.DATETIME:
            analyzer = LabelledDatetimeDataAnalyzer()
        else:
            raise AnalyzerDataTypeNotSupported(data_type)
        if key in sequence_names:
            self.analyzers[key] = SequenceAnalyzer(analyzer=analyzer)
        else:
            self.analyzers[key] = analyzer
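# Illustrative usage sketch: the enclosing class name (DataAnalyzerSuite) is
# assumed from the docstring; the DATATYPE members are those matched above.
suite = DataAnalyzerSuite(
    data_type_list=[DATATYPE.CATEGORY, DATATYPE.NUMBER, DATATYPE.FREETEXT],
    column_names=['gender', 'age', 'notes'],
    sequence_names=['notes'],  # 'notes' gets wrapped in a SequenceAnalyzer
)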
def total_count(self, value: int):
    if not isinstance(value, int):
        raise InvalidTypeError('total_count', type(value), '<int>')
    self._total_count = value
def median(self, value: Union[float, int, None]):
    if not isinstance(value, (float, int)) and value is not None:
        raise InvalidTypeError('median', type(value), '<int> or <float> or None')
    self._median = value
def resolution_list(self, resolution_list: List[int]):
    if type(resolution_list) != list:
        raise InvalidTypeError('resolution_list', type(resolution_list), '<list>')
    for value in resolution_list:
        if type(value) != int:
            raise InvalidTypeError('resolution_list: item', type(value), '<int>')
    self._resolution_list = resolution_list
def total_count(self, value: int):
    if type(value) != int:
        raise InvalidTypeError('total_count', type(value), '<int>')
    self._total_count = value