Пример #1
0
    def getWordsDictionary(self, data, full_text=False):
        """Count how often each item occurs in ``data``.

        Args:
            data: Iterable of cell values. Every counted item must be
                hashable.
            full_text: When true, each cell is first passed to
                ``splitRecursive(cell, WORD_SEPARATORS)`` and the pieces it
                yields are counted; otherwise the cells themselves are
                counted.

        Returns:
            A tuple ``(x, histogram)``. ``x`` lists the distinct items in
            order of first appearance, and ``histogram`` is
            ``{'x': x, 'y': counts}`` where ``counts[i]`` is the number of
            occurrences of ``x[i]``.
        """
        # Function-scope import so the method does not rely on a
        # module-level import being present.
        from collections import Counter

        if full_text:
            # Gather the pieces of every cell into one flat list.
            items = []
            for cell in data:
                items.extend(splitRecursive(cell, WORD_SEPARATORS))
        else:
            items = data

        # Counter makes a single pass and, like the dict it is built on,
        # keeps keys in first-insertion order.
        counts = Counter(items)
        x = list(counts)
        histogram = {
            'x': x,
            'y': list(counts.values()),
        }
        return x, histogram
Пример #2
0
def get_text_histogram(data):
    """Build a histogram over the pieces of every cell in ``data``.

    Each cell is handed to ``splitRecursive`` together with
    ``WORD_SEPARATORS``. The pieces from all cells are gathered, in cell
    order, into one flat list which is passed to ``get_hist``; whatever
    ``get_hist`` returns is returned unchanged.
    """
    tokens = [
        token
        for cell in data
        for token in splitRecursive(cell, WORD_SEPARATORS)
    ]
    return get_hist(tokens)
Пример #3
0
    def get_words_histogram(data, is_full_text=False):
        """Count how often each item occurs in ``data``.

        Args:
            data: Iterable of cell values. Every counted item must be
                hashable.
            is_full_text: When true, each cell is first passed to
                ``splitRecursive(cell, WORD_SEPARATORS)`` and the pieces it
                yields are counted; otherwise the cells themselves are
                counted.

        Returns:
            A dict ``{'x': items, 'y': counts}`` where ``items`` holds the
            distinct items in order of first appearance and ``counts[i]``
            is the number of occurrences of ``items[i]``.
        """
        # Function-scope import so the function does not rely on a
        # module-level import being present.
        from collections import Counter

        if is_full_text:
            # Gather the pieces of every cell into one flat list.
            items = []
            for cell in data:
                items.extend(splitRecursive(cell, WORD_SEPARATORS))
        else:
            items = data

        # Single pass; keys stay in first-insertion order.
        counts = Counter(items)
        return {'x': list(counts), 'y': list(counts.values())}
Пример #4
0
def _norm_is_null_like(value):
    """Return True when ``value`` is a placeholder ``norm`` treats as missing."""
    # Comparing the string form covers None, False and float NaN (whose
    # str() forms are 'None', 'False' and 'nan') as well as the literal
    # markers 'NaN' and 'NA'.
    if str(value) in ('', ' ', 'None', 'False', 'nan', 'NaN', 'NA'):
        return True
    return value is None or value == '' or value == '\n' or value == '\r'


def norm(value, cell_stats):
    """Encode a single cell value as a list, according to its column type.

    The column type is read from ``cell_stats[KEYS.DATA_TYPE]`` and selects
    one of four encodings:

    * ``DATA_TYPES.NUMERIC``: ``[magnitude, sign, present]``. The value is
      scaled with ``cell_stats['min']`` / ``cell_stats['max']``;
      ``magnitude`` is its absolute value plus ``OFFSET``, ``sign`` is 1
      for non-negative and 0 for negative, and ``present`` is 1.0.
      A missing value yields ``[0, 0, 0]``.
    * ``DATA_TYPES.DATE``: seven entries
      ``[timestamp, year, month, day, minute, second, present]``, each
      scaled, with ``present`` equal to 1.0. A missing or unparseable
      value yields seven zeros.
    * ``DATA_TYPES.TEXT``: a one-hot list of length
      ``len(cell_stats['dictionary']) + TEXT_ENCODING_EXTRA_LENGTH``. The
      last slot is 1.0 for a present value and 0 for ``None`` / ``''``;
      a value not found in the dictionary sets the second-to-last slot.
    * ``DATA_TYPES.FULL_TEXT``: ``FULL_TEXT_IS_START``, then one dictionary
      index per piece returned by ``splitRecursive(value, WORD_SEPARATORS)``
      (``FULL_TEXT_UN_FREQUENT`` for pieces not in the dictionary), then
      ``FULL_TEXT_IS_END``. A missing value yields
      ``[FULL_TEXT_NONE_VALUE]``.

    For the two text types an empty list is returned when
    ``cell_stats['dictionaryAvailable']`` is falsy.

    Args:
        value: The raw cell value.
        cell_stats: Mapping with the column's type and the statistics the
            selected encoding reads (``'min'``, ``'max'``, ``'dictionary'``,
            ``'dictionaryAvailable'``).

    Returns:
        The encoded list, or ``None`` when the column type matches none of
        the four handled types.
    """
    data_type = cell_stats[KEYS.DATA_TYPE]

    if data_type == DATA_TYPES.NUMERIC:
        if _norm_is_null_like(value):
            return [0, 0, 0]

        low = cell_stats['min']
        high = cell_stats['max']
        span = high - low

        if span != 0:
            normalized = (value - low) / span
        elif high != 0:
            # Constant column: fall back to scaling by the maximum alone.
            normalized = value / high
        else:
            # min == max == 0, nothing to scale by.
            normalized = value

        sign = 1 if normalized >= 0 else 0
        return [abs(normalized) + OFFSET, sign, 1.0]

    if data_type == DATA_TYPES.DATE:
        # Layout: [timestamp, year, month, day, minute, second, present]
        if _norm_is_null_like(value):
            return [0] * 7

        try:
            timestamp = int(parseDate(value).timestamp())
        except Exception:
            # Best effort: anything the parser cannot handle is encoded
            # exactly like a missing value.
            return [0] * 7

        date = datetime.datetime.fromtimestamp(timestamp)
        date_max = datetime.datetime.fromtimestamp(cell_stats['max'])
        date_min = datetime.datetime.fromtimestamp(cell_stats['min'])

        low = cell_stats['min']
        high = cell_stats['max']
        span = high - low

        if span != 0:
            norm_vals = [(timestamp - low) / span]
        elif high != 0:
            norm_vals = [timestamp / high]
        else:
            # min == max == 0: avoid dividing by zero, mirroring the
            # numeric branch's fallback.
            norm_vals = [float(timestamp)]

        # Components with a fixed range are scaled against it; the year is
        # scaled against the years of the column's min and max dates.
        fixed_maxes = {'month': 12, 'day': 31, 'minute': 60, 'second': 60}

        for attr in ('year', 'month', 'day', 'minute', 'second'):
            current = getattr(date, attr)
            if attr in fixed_maxes:
                upper = fixed_maxes[attr]
                lower = 0
            else:
                upper = getattr(date_max, attr)
                lower = getattr(date_min, attr)

            if upper - lower != 0:
                norm_vals.append((current - lower) / (upper - lower))
            else:
                norm_vals.append(current / upper)

        norm_vals.append(1.0)
        return norm_vals

    if data_type == DATA_TYPES.TEXT:
        if not cell_stats['dictionaryAvailable']:
            return []

        dictionary = cell_stats['dictionary']
        # One slot per dictionary entry plus the extra slots; per the
        # original note the extras are one for rare words and one for
        # null (TEXT_ENCODING_EXTRA_LENGTH is defined elsewhere).
        vector_length = len(dictionary) + TEXT_ENCODING_EXTRA_LENGTH

        if value in (None, ''):
            # Missing value: every slot, including the trailing
            # "present" flag, stays 0.
            return [0] * vector_length

        one_hot = [0] * vector_length
        one_hot[-1] = 1.0  # trailing flag: a value is present

        try:
            index = dictionary.index(value)
        except ValueError:
            # Not in the dictionary: use the second-to-last slot.
            index = vector_length - 2

        one_hot[index] = 1
        return one_hot

    if data_type == DATA_TYPES.FULL_TEXT:
        if _norm_is_null_like(value):
            return [FULL_TEXT_NONE_VALUE]

        if not cell_stats['dictionaryAvailable']:
            return []

        dictionary = cell_stats['dictionary']

        encoded = [FULL_TEXT_IS_START]
        for word in splitRecursive(value, WORD_SEPARATORS):
            try:
                encoded.append(dictionary.index(word))
            except ValueError:
                # Piece is not in the dictionary.
                encoded.append(FULL_TEXT_UN_FREQUENT)
        encoded.append(FULL_TEXT_IS_END)

        # TODO(review): the original author left open whether this should
        # be wrapped in an outer list; the flat list is kept.
        return encoded

    # Unhandled column type.
    return None