Exemplos de find_1D em Python, exemplos de numpy_utils.find_1D em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: feature_extraction_new_3.py Projeto: dhvfanny/Ali2015-MobileRecommendation

    def extract_features_5(self, data, iid, time_thresh=None, time_start=0):
        """
        ITEM level.
        Build the feature list for one user's behaviours on one item.

        Parameters
        ----------
        data : non-empty list of [behavior_type, user_geohash, time] rows.
               With an empty list np.max / np.min raise ValueError.
        iid : item id, looked up in self.item_geos.
        time_thresh : the time of the day to be predicted; defaults to
                        self.TIME_THRESH when None.
                        e.g., if one needs to predict the sale data in day
                        `2014-12-18 00`, then this time_thresh should be set to
                        feature_extraction.duration_hours('2014-12-18 00')
        time_start : value used in place of the last / first / duration time
                        of a behaviour type that does not occur in `data`.

        Returns
        -------
        A Python list (not a numpy array) of 21 values, in this order:
            max same prefix between user and item geohashes (-1 if no pair)
            normalized max same prefix (-1 if no pair)
            total number of behaviours 1, 2, 3, 4
            time between last behaviour and pred day (overall, 1, 2, 3, 4)
            time between first behaviour and pred day (overall, 1, 2, 3, 4)
            duration (overall, 1, 2, 3, 4)

        """
        if time_thresh is None:
            time_thresh = self.TIME_THRESH

        # Rows with an empty geohash are left out of the geo comparison only.
        user_geos = np.array([row[1] for row in data if row[1] != ''])
        behaviors = np.array([row[0] for row in data])
        times = np.array([row[2] for row in data])

        # One find_1D result per behaviour type, in the order 1, 2, 3, 4.
        per_type = [find_1D(behaviors == kind) for kind in (1, 2, 3, 4)]

        def check_time(rows, mode='max'):
            # A behaviour type with no rows falls back to time_start for
            # every mode, including 'duration'.
            if rows.shape[0] == 0:
                return time_start
            selected = times[rows]
            if mode == 'max':
                return np.max(selected)
            if mode == 'min':
                return np.min(selected)
            if mode == 'duration':
                return np.max(selected) - np.min(selected)

        last_seen = np.max(times)
        first_seen = np.min(times)

        # Counts per behaviour type.
        feature = [rows.shape[0] for rows in per_type]

        # Time from the last behaviour to the prediction day.
        feature.append(time_thresh - last_seen)
        feature += [time_thresh - check_time(rows) for rows in per_type]

        # Time from the first behaviour to the prediction day.
        feature.append(time_thresh - first_seen)
        feature += [time_thresh - check_time(rows, 'min') for rows in per_type]

        # Durations between first and last behaviour.
        feature.append(last_seen - first_seen)
        feature += [check_time(rows, 'duration') for rows in per_type]

        # Best geohash match over every (item geohash, user geohash) pair;
        # both values stay at -1 when there is nothing to compare.
        dist = -1
        dist_norm = -1
        if iid in self.item_geos:
            for i_geo in self.item_geos[iid]:
                for u_geo in user_geos:
                    dist = max(dist, self.geo_dist(i_geo, u_geo))
                    dist_norm = max(
                        dist_norm, self.geo_dist(i_geo, u_geo, normalize=True))

        return [dist, dist_norm] + feature

Exemplo n.º 2

0

Exibir arquivo

Arquivo: feature_extraction_new_3.py Projeto: dhvfanny/Ali2015-MobileRecommendation

    def  extract_features_3(self, data, time_thresh=None, time_start=0):
        """
        CAT level.
        Get feature vector from one user's behaviours of one category.

        NOTE(review): only the beginning of this method is visible in this
        excerpt (it ends without a return). The visible code appends the first
        eight features listed below; the remaining entries are the original
        author's description and cannot be checked against code here.

        Parameters
        ----------
        data : data dict. Structure: {iid: [[behavior_type, user_geohash, time]]}
               Entries whose iid is not > 0 are ignored.
        time_thresh : the time of the day to be predicted to day `2014-11-18 00`.
                        e.g., if one needs to predict the sale data in day `2014-12-18 00`,
                        then this time_thresh should be set to 
                        feature_extraction.duration_hours('2014-12-18 00')
                        Defaults to self.TIME_THRESH when None.
        time_start : not used in the visible part of the method.

        Returns
        -------
        1-D numpy feature vector (per the original author; the visible code
        builds a Python list). This function extracts the following features:
            total number of different items
            total number of behaviours 1
            total number of behaviours 2
            total number of behaviours 3
            total number of behaviours 4
            4/1
            4/2
            4/3
            \ mean 4/i for all items
            \ median 4/i for all items
            #\ max 4/i for all items
            #\ min 4/i for all items
            count of items with 2
            count of items with 3
            count of items with 4
            number of items with (4/3/2/1) / number of items without (4/3/2/1)
            \ mean number of behaviours i for all item
            \ mean number of behaviours i for the items with i
            \ median number of behaviours i for the items with i
            \ max number of behaviours i for the items with i #------------------------- useful???
            #\ min number of behaviours i for the items with i
            \ mean time between last behaviour to pred day for all item
            \ max time between last behaviour to pred day for all item #--------------------- min???
            \ min time between last behaviour to pred day for all item
            \ time between last behaviour 1 to pred day
            \ time between last behaviour 2 to pred day
            \ time between last behaviour 3 to pred day
            \ time between last behaviour 4 to pred day
            \ mean overall duration
            \ median overall duration
            \ max overall duration
            #\ min overall duration
            #\ min time between last behaviour to pred day for all item
            #total number of items
            #log(t+1) for all above

            Total dimensions: 2 * (1 + 4 + 3 + 12 + 3)

        dict {iid: [1-D feature vector], ...}
            count of items with behaviours 2 except iid
            count of items with behaviours 3 except iid
            count of items with behaviours 4 except iid
            average number of behaviours 1 except iid
            average number of behaviours 2 except iid
            average number of behaviours 3 except iid
            average number of behaviours 4 except iid
            min time between last behaviour 1 to pred day except iid
            min time between last behaviour 2 to pred day except iid
            min time between last behaviour 3 to pred day except iid
            min time between last behaviour 4 to pred day except iid
            min time between last behaviour to pred day except iid
            #4/1 except cid
            #4/2 except cid
            #4/3 except cid

        """
        if time_thresh is None:
            time_thresh = self.TIME_THRESH
        # Flatten the per-item dict into rows of [iid] + entry and, in the
        # same pass, count behaviours 1..4 per item in iid_dict.
        raw_data = []
        iid_dict = {}
        for iid in data:
            if iid > 0:
                temp = [[iid] + entry for entry in data[iid]]
                raw_data += temp
                # Slots 0..3 hold the counts of behaviour types 1..4.
                iid_dict[iid] = [0.0, 0.0, 0.0, 0.0]
                for entry in data[iid]:
                    if entry[0] == 1:
                        iid_dict[iid][0] += 1
                    elif entry[0] == 2:
                        iid_dict[iid][1] += 1
                    elif entry[0] == 3:
                        iid_dict[iid][2] += 1
                    elif entry[0] == 4:
                        iid_dict[iid][3] += 1
        # From here on `data` is the flattened row list, not the input dict.
        data = raw_data

        #dim = 46

        #feature = np.zeros(dim, dtype=np.float)
        #feature = [0 for i in xrange(dim)]
        feature = []
        # Each row is [iid] + entry, so column 0 is the item id, column 1 the
        # behaviour type and column 3 the time (given the documented entry
        # layout [behavior_type, user_geohash, time]).
        item_ids = np.array([data[i][0] for i in xrange(len(data))])
        behaviors = np.array([data[i][1] for i in xrange(len(data))])
        times = np.array([data[i][3] for i in xrange(len(data))])

        # find_1D is defined outside this excerpt; its result is used below
        # only through .shape[0], i.e. as a per-behaviour count.
        b_1 = find_1D(behaviors == 1)
        b_2 = find_1D(behaviors == 2)
        b_3 = find_1D(behaviors == 3)
        b_4 = find_1D(behaviors == 4)

        item_unique = np.unique(item_ids)

        # Feature 0: number of distinct items.
        feature.append(item_unique.shape[0])

        # Features 1-4: total number of behaviours 1, 2, 3, 4.
        feature += [b_1.shape[0], b_2.shape[0], b_3.shape[0], b_4.shape[0]]
        # Features 5-7: ratios 4/1, 4/2, 4/3 (in that order).
        feature += [self.new_divide(b_4.shape[0], b_1.shape[0]), self.new_divide(b_4.shape[0], b_2.shape[0]), self.new_divide(b_4.shape[0], b_3.shape[0])]

        # Per-item ratios 4/1, 4/2, 4/3, restricted to items that have at
        # least one behaviour 4.
        iid_ratio = np.array([[self.new_divide(entry[3], entry[0]), self.new_divide(entry[3], entry[1]), self.new_divide(entry[3], entry[2])] for entry in iid_dict.values() if entry[3] != 0])
        # An empty list yields an array of shape (0,); substitute one all-zero
        # row in that case. `0L` is a Python 2 long literal, so this line does
        # not parse on Python 3.
        if iid_ratio.shape == (0L,):
            iid_ratio = np.array([[0.0, 0.0, 0.0]])

Exemplo n.º 3

0

Exibir arquivo

Arquivo: feature_extraction_new_3.py Projeto: yluibe/Ali2015-MobileRecommendation

    def extract_features_3(self, data, time_thresh=None, time_start=0):
        """
        CAT level.
        Get feature vector from one user's behaviours of one category.

        NOTE(review): only the beginning of this method is visible in this
        excerpt (it ends without a return). The visible code appends the first
        eight features listed below; the remaining entries are the original
        author's description and cannot be checked against code here.

        Parameters
        ----------
        data : data dict. Structure: {iid: [[behavior_type, user_geohash, time]]}
               Entries whose iid is not > 0 are ignored.
        time_thresh : the time of the day to be predicted to day `2014-11-18 00`.
                        e.g., if one needs to predict the sale data in day `2014-12-18 00`,
                        then this time_thresh should be set to 
                        feature_extraction.duration_hours('2014-12-18 00')
                        Defaults to self.TIME_THRESH when None.
        time_start : not used in the visible part of the method.

        Returns
        -------
        1-D numpy feature vector (per the original author; the visible code
        builds a Python list). This function extracts the following features:
            total number of different items
            total number of behaviours 1
            total number of behaviours 2
            total number of behaviours 3
            total number of behaviours 4
            4/1
            4/2
            4/3
            \ mean 4/i for all items
            \ median 4/i for all items
            #\ max 4/i for all items
            #\ min 4/i for all items
            count of items with 2
            count of items with 3
            count of items with 4
            number of items with (4/3/2/1) / number of items without (4/3/2/1)
            \ mean number of behaviours i for all item
            \ mean number of behaviours i for the items with i
            \ median number of behaviours i for the items with i
            \ max number of behaviours i for the items with i #------------------------- useful???
            #\ min number of behaviours i for the items with i
            \ mean time between last behaviour to pred day for all item
            \ max time between last behaviour to pred day for all item #--------------------- min???
            \ min time between last behaviour to pred day for all item
            \ time between last behaviour 1 to pred day
            \ time between last behaviour 2 to pred day
            \ time between last behaviour 3 to pred day
            \ time between last behaviour 4 to pred day
            \ mean overall duration
            \ median overall duration
            \ max overall duration
            #\ min overall duration
            #\ min time between last behaviour to pred day for all item
            #total number of items
            #log(t+1) for all above

            Total dimensions: 2 * (1 + 4 + 3 + 12 + 3)

        dict {iid: [1-D feature vector], ...}
            count of items with behaviours 2 except iid
            count of items with behaviours 3 except iid
            count of items with behaviours 4 except iid
            average number of behaviours 1 except iid
            average number of behaviours 2 except iid
            average number of behaviours 3 except iid
            average number of behaviours 4 except iid
            min time between last behaviour 1 to pred day except iid
            min time between last behaviour 2 to pred day except iid
            min time between last behaviour 3 to pred day except iid
            min time between last behaviour 4 to pred day except iid
            min time between last behaviour to pred day except iid
            #4/1 except cid
            #4/2 except cid
            #4/3 except cid

        """
        if time_thresh is None:
            time_thresh = self.TIME_THRESH
        # Flatten the per-item dict into rows of [iid] + entry and, in the
        # same pass, count behaviours 1..4 per item in iid_dict.
        raw_data = []
        iid_dict = {}
        for iid in data:
            if iid > 0:
                temp = [[iid] + entry for entry in data[iid]]
                raw_data += temp
                # Slots 0..3 hold the counts of behaviour types 1..4.
                iid_dict[iid] = [0.0, 0.0, 0.0, 0.0]
                for entry in data[iid]:
                    if entry[0] == 1:
                        iid_dict[iid][0] += 1
                    elif entry[0] == 2:
                        iid_dict[iid][1] += 1
                    elif entry[0] == 3:
                        iid_dict[iid][2] += 1
                    elif entry[0] == 4:
                        iid_dict[iid][3] += 1
        # From here on `data` is the flattened row list, not the input dict.
        data = raw_data

        #dim = 46

        #feature = np.zeros(dim, dtype=np.float)
        #feature = [0 for i in xrange(dim)]
        feature = []
        # Each row is [iid] + entry, so column 0 is the item id, column 1 the
        # behaviour type and column 3 the time (given the documented entry
        # layout [behavior_type, user_geohash, time]).
        item_ids = np.array([data[i][0] for i in xrange(len(data))])
        behaviors = np.array([data[i][1] for i in xrange(len(data))])
        times = np.array([data[i][3] for i in xrange(len(data))])

        # find_1D is defined outside this excerpt; its result is used below
        # only through .shape[0], i.e. as a per-behaviour count.
        b_1 = find_1D(behaviors == 1)
        b_2 = find_1D(behaviors == 2)
        b_3 = find_1D(behaviors == 3)
        b_4 = find_1D(behaviors == 4)

        item_unique = np.unique(item_ids)

        # Feature 0: number of distinct items.
        feature.append(item_unique.shape[0])

        # Features 1-4: total number of behaviours 1, 2, 3, 4.
        feature += [b_1.shape[0], b_2.shape[0], b_3.shape[0], b_4.shape[0]]
        # Features 5-7: ratios 4/1, 4/2, 4/3 (in that order).
        feature += [
            self.new_divide(b_4.shape[0], b_1.shape[0]),
            self.new_divide(b_4.shape[0], b_2.shape[0]),
            self.new_divide(b_4.shape[0], b_3.shape[0])
        ]

        # Per-item ratios 4/1, 4/2, 4/3, restricted to items that have at
        # least one behaviour 4.
        iid_ratio = np.array([[
            self.new_divide(entry[3], entry[0]),
            self.new_divide(entry[3], entry[1]),
            self.new_divide(entry[3], entry[2])
        ] for entry in iid_dict.values() if entry[3] != 0])
        # An empty list yields an array of shape (0,); substitute one all-zero
        # row in that case. `0L` is a Python 2 long literal, so this line does
        # not parse on Python 3.
        if iid_ratio.shape == (0L, ):
            iid_ratio = np.array([[0.0, 0.0, 0.0]])

Exemplo n.º 4

0

Exibir arquivo

Arquivo: feature_extraction_new_3.py Projeto: yluibe/Ali2015-MobileRecommendation

    def extract_features_5(self, data, iid, time_thresh=None, time_start=0):
        """
        ITEM level.
        Build the feature list for one user's behaviours on one item.

        Parameters
        ----------
        data : non-empty list of [behavior_type, user_geohash, time] rows.
               With an empty list np.max / np.min raise ValueError.
        iid : item id, looked up in self.item_geos.
        time_thresh : the time of the day to be predicted; defaults to
                        self.TIME_THRESH when None.
                        e.g., if one needs to predict the sale data in day
                        `2014-12-18 00`, then this time_thresh should be set to
                        feature_extraction.duration_hours('2014-12-18 00')
        time_start : value used in place of the last / first / duration time
                        of a behaviour type that does not occur in `data`.

        Returns
        -------
        A Python list (not a numpy array) of 21 values, in this order:
            max same prefix between user and item geohashes (-1 if no pair)
            normalized max same prefix (-1 if no pair)
            total number of behaviours 1, 2, 3, 4
            time between last behaviour and pred day (overall, 1, 2, 3, 4)
            time between first behaviour and pred day (overall, 1, 2, 3, 4)
            duration (overall, 1, 2, 3, 4)

        """
        if time_thresh is None:
            time_thresh = self.TIME_THRESH

        # Rows with an empty geohash are left out of the geo comparison only.
        user_geos = np.array([row[1] for row in data if row[1] != ''])
        behaviors = np.array([row[0] for row in data])
        times = np.array([row[2] for row in data])

        # One find_1D result per behaviour type, in the order 1, 2, 3, 4.
        per_type = [find_1D(behaviors == kind) for kind in (1, 2, 3, 4)]

        def check_time(rows, mode='max'):
            # A behaviour type with no rows falls back to time_start for
            # every mode, including 'duration'.
            if rows.shape[0] == 0:
                return time_start
            selected = times[rows]
            if mode == 'max':
                return np.max(selected)
            if mode == 'min':
                return np.min(selected)
            if mode == 'duration':
                return np.max(selected) - np.min(selected)

        last_seen = np.max(times)
        first_seen = np.min(times)

        # Counts per behaviour type.
        feature = [rows.shape[0] for rows in per_type]

        # Time from the last behaviour to the prediction day.
        feature.append(time_thresh - last_seen)
        feature += [time_thresh - check_time(rows) for rows in per_type]

        # Time from the first behaviour to the prediction day.
        feature.append(time_thresh - first_seen)
        feature += [time_thresh - check_time(rows, 'min') for rows in per_type]

        # Durations between first and last behaviour.
        feature.append(last_seen - first_seen)
        feature += [check_time(rows, 'duration') for rows in per_type]

        # Best geohash match over every (item geohash, user geohash) pair;
        # both values stay at -1 when there is nothing to compare.
        dist = -1
        dist_norm = -1
        if iid in self.item_geos:
            for i_geo in self.item_geos[iid]:
                for u_geo in user_geos:
                    dist = max(dist, self.geo_dist(i_geo, u_geo))
                    dist_norm = max(
                        dist_norm, self.geo_dist(i_geo, u_geo, normalize=True))

        return [dist, dist_norm] + feature

Exemplo n.º 5

0

Exibir arquivo

Arquivo: feature_extraction_new_3.py Projeto: yluibe/Ali2015-MobileRecommendation

class Feature(object):
    def __init__(self, item_dict='../data/itemdict'):
        """
        Class initialization.

        Resets the two geohash lookup tables and fills them via read_geos.

        Parameters
        ----------
        item_dict : forwarded unchanged to read_geos, which hands it to
                    data_utils.load_P_item. Defaults to '../data/itemdict'
                    (presumably the location of the item table -- confirm
                    against data_utils).

        """
        # Both tables start as None so that read_geos performs the load.
        self.item_geos = None
        self.cat_geos = None
        self.read_geos(item_dict)

    def read_geos(self, item_dict):
        """
        Read geohash for items and corresponding categories, and the results are 
        stored in two dictionaries self.item_geos and self.cat_geos.
        The stucture of the self.item_geos and self.cat_geos:
            The key of the dict is the iid (item id) or cid (category id), and 
            the value is a list of all geohash corresponding to the specific id.

        """
        if self.item_geos is None:
            self.item_geos = {}
            self.cat_geos = {}
            _, P_item_id, P_item_geo, P_item_cat = data_utils.load_P_item(
                item_dict)
            self.P_item_id = P_item_id
            P_item_id_unique = np.unique(P_item_id).tolist()
            P_item_id_unique = dict((el, 0) for el in P_item_id_unique)
            self.P_item_id_unique = P_item_id_unique
            self.P_item_cat = P_item_cat
            P_item_cat_unique = np.unique(P_item_cat).tolist()
            P_item_cat_unique = dict((el, 0) for el in P_item_cat_unique)
            self.P_item_cat_unique = P_item_cat_unique
            for i in xrange(P_item_geo.shape[0]):
                if P_item_geo[i] != '':
                    if P_item_id[i] in self.item_geos:
                        self.item_geos[P_item_id[i]].append(P_item_geo[i])
                    else:
                        self.item_geos[P_item_id[i]] = [P_item_geo[i]]
                    if P_item_cat[i] in self.cat_geos:
                        self.cat_geos[P_item_cat[i]].append(P_item_geo[i])
                    else:
                        self.cat_geos[P_item_cat[i]] = [P_item_geo[i]]

    def new_divide(self, a, b):
        re = -1
        try:
            re = float(a) / float(b)
        except:
            if a == 0:
                re = 0
        return re

    def geo_dist(self, geo1, geo2, normalize=False):
        """
        Calculate the distance between two geohashes.

        Parameters
        ----------
        geo1 : geohashes 1, string.
        geo2 : geohashes 2, string.
        The two input geohashes should have the same length, otherwise there 
        will be an error.

        Returns
        -------
        The number of the same prefix of two geohashes. The higher, the closer.

        Examples
        --------
        >>> a = feature_extraction.geo_dist('9adsf12', '9a3241s')
        2

        """
        l1 = len(geo1)
        l2 = len(geo2)
        if l1 != l2:
            #print geo1
            #print geo2
            raise AttributeError('Wrong geos. Geo1: %s, geo2: %s' %
                                 (geo1, geo2))
        same = 0
        for i in xrange(l1):
            if geo1[i] != geo2[i]:
                break
            else:
                same += 1
        if normalize:
            """
            1 5009.4km x 4992.6km
            2 1252.3km x 624.1km
            3 156.5km x 156km
            4 39.1km x 19.5km
            5 4.9km x 4.9km
            6 1.2km x 0.61km
            7 152.9m x 152.4m
            """
            if same == 0 or same == 1:
                pass
            elif same == 2:
                same *= 32
            elif same == 3:
                same *= 1024.5
            elif same == 4:
                same *= 32802
            elif same == 5:
                same *= 1041646.4
            elif same == 6:
                same *= 34166571.6
            elif same == 7:
                same *= 1073297286.6
        return same

    def extract_features_2_2(self, data, time_thresh=None):
        """
        USER level 2.

        Parameters
        ----------
        data : data dict. Structure: {cid: {iid: [[behavior_type, user_geohash, time]]}}
        time_thresh : the time of the day to be predicted to day `2014-11-18 00`.
                        e.g., if one needs to predict the sale data in day `2014-12-18 00`,
                        then this time_thresh shoule be set to 
                        feature_extraction.duration_hours('2014-12-18 00')

        Returns
        -------
        1-D #numpy# feature vector. Features:
            total number of different categories
            total number of different items
            total number of behaviours 1
            total number of behaviours 2
            total number of behaviours 3
            total number of behaviours 4
            average number of behaviours 1
            average number of behaviours 2
            average number of behaviours 3
            average number of behaviours 4
            4/1
            4/2
            4/3
            total number of cats with 2
            total number of cats with 3
            total number of cats with 4
            total number of items with 2  #---------------- need to be added
            total number of items with 3  #---------------- need to be added
            total number of items with 4  #---------------- need to be added
            number of cats with (4/3/2/1) / number of cats without (4/3/2/1)
            number of items with (4/3/2/1) / number of items without (4/3/2/1)
            #log of above

        dict {cid:[1-D feature vector], ...}
            #average number of behaviours 1 except cid
            #average number of behaviours 2 except cid
            #average number of behaviours 3 except cid
            number of behaviours 4 except cid
            average number of behaviours 4 except cid
            #4/1 except cid
            #4/2 except cid
            #4/3 except cid
            
        """
        #dim = 6
        #dim = 14

        #feature = np.zeros(dim, dtype=np.float)
        #feature = []

        #id = 0

        #feature[id:id+2] = self.extract_features_2(data, time_thresh)
        #id += 2
        #feature += self.extract_features_2(data, time_thresh)
        #feature.append(len(data))
        c_b = {
            cid: [0.0, 0.0, 0.0, 0.0]
            for cid in data if cid > 0 and cid in self.P_item_cat_unique
        }

        num_items = 0.0
        b_1 = 0.0
        b_2 = 0.0
        b_3 = 0.0
        b_4 = 0.0
        # b_1_e = 0.0
        # b_2_e = 0.0
        # b_3_e = 0.0
        # b_4_e = 0.0
        c_1 = 0.0
        c_2 = 0.0
        c_3 = 0.0
        c_4 = 0.0
        c_i1 = 0.0
        c_i2 = 0.0
        c_i3 = 0.0
        c_i4 = 0.0
        for cid in data:
            if cid > 0:
                temp_c_1 = 0
                temp_c_2 = 0
                temp_c_3 = 0
                temp_c_4 = 0
                for iid in data[cid]:
                    if iid > 0:
                        temp_c_i1 = 0
                        temp_c_i2 = 0
                        temp_c_i3 = 0
                        temp_c_i4 = 0
                        num_items += 1
                        for entry in data[cid][iid]:
                            if entry[0] == 1:
                                b_1 += 1
                                temp_c_1 = 1
                                temp_c_i1 = 1
                                # if cid != CID:
                                #     b_1_e += 1
                                if cid in self.P_item_cat_unique:
                                    c_b[cid][0] += 1
                            elif entry[0] == 2:
                                b_2 += 1
                                temp_c_2 = 1
                                temp_c_i2 = 1
                                # if cid != CID:
                                #     b_2_e += 1
                                if cid in self.P_item_cat_unique:
                                    c_b[cid][1] += 1
                            elif entry[0] == 3:
                                b_3 += 1
                                temp_c_3 = 1
                                temp_c_i3 = 1
                                # if cid != CID:
                                #     b_3_e += 1
                                if cid in self.P_item_cat_unique:
                                    c_b[cid][2] += 1
                            elif entry[0] == 4:
                                b_4 += 1
                                temp_c_4 = 1
                                temp_c_i4 = 1
                                # if cid != CID:
                                #     b_4_e += 1
                                if cid in self.P_item_cat_unique:
                                    c_b[cid][3] += 1
                        c_i1 += temp_c_i1
                        c_i2 += temp_c_i2
                        c_i3 += temp_c_i3
                        c_i4 += temp_c_i4
                c_1 += temp_c_1
                c_2 += temp_c_2
                c_3 += temp_c_3
                c_4 += temp_c_4
        # feature[id] = num_items
        # id += 1
        # feature[id] = b_1
        # id += 1
        # feature[id] = b_2
        # id += 1
        # feature[id] = b_3
        # id += 1
        # feature[id] = b_4
        # id += 1

        feature = [
            len(data), num_items, b_1, b_2, b_3, b_4, b_1 / (len(data)),
            b_2 / (len(data)), b_3 / (len(data)), b_4 / (len(data))
        ]  #, b_1_e/(len(data)-1), b_2_e/(len(data)-1), b_3_e/(len(data)-1), b_4_e/(len(data)-1)]
        feature += [
            self.new_divide(b_4, b_3),
            self.new_divide(b_4, b_2),
            self.new_divide(b_4, b_1)
        ]
        #feature += [b_4_e/b_3_e, b_4_e/b_2_e, b_4_e/b_1_e]
        feature += [c_2, c_3, c_4]
        feature += [c_i2, c_i3, c_i4]
        feature += [
            self.new_divide(c_1,
                            len(data) - c_1),
            self.new_divide(c_2,
                            len(data) - c_2),
            self.new_divide(c_3,
                            len(data) - c_3),
            self.new_divide(c_4,
                            len(data) - c_4)
        ]
        feature += [
            self.new_divide(c_i1, num_items - c_i1),
            self.new_divide(c_i2, num_items - c_i2),
            self.new_divide(c_i3, num_items - c_i3),
            self.new_divide(c_i4, num_items - c_i4)
        ]

        # feature[id:] = np.log(feature[0:id] + 2)
        #logable = [True for i in feature]

        c_feat = {cid: 0 for cid in c_b}
        for cid in c_b:
            c_feat[cid] = [
                c_b[cid][3],
                self.new_divide(c_b[cid][3],
                                len(data) - 1)
            ]
            #c_feat[cid] = [self.new_divide(c_b[cid][0], len(data)-1), self.new_divide(c_b[cid][1], len(data)-1), self.new_divide(c_b[cid][2], len(data)-1), self.new_divide(c_b[cid][3], len(data)-1), self.new_divide(c_b[cid][3], c_b[cid][0]), self.new_divide(c_b[cid][3], c_b[cid][1]), self.new_divide(c_b[cid][3], c_b[cid][2])]

        return feature, c_feat

    def extract_features_3(self, data, time_thresh=None, time_start=0):
        """
        CAT level.
        Get feature vector from one user's behaviours of one category.
        #The structure of 'data' is a dict {item_id: [behavior_type, user_geohash, time]}

        Parameters
        ----------
        data : data dict. Structure: {iid: [[behavior_type, user_geohash, time]]}
        time_thresh : the time of the day to be predicted to day `2014-11-18 00`.
                        e.g., if one needs to predict the sale data in day `2014-12-18 00`,
                        then this time_thresh should be set to
                        feature_extraction.duration_hours('2014-12-18 00')

        Returns
        -------
        1-D numpy feature vector. This function extracts the following features:
            total number of different items
            total number of behaviours 1
            total number of behaviours 2
            total number of behaviours 3
            total number of behaviours 4
            4/3
            4/2
            4/1
            \ mean 4/i for all items
            \ median 4/i for all items
            #\ max 4/i for all items
            #\ min 4/i for all items
            count of items with 2
            count of items with 3
            count of items with 4
            number of items with (4/3/2/1) / number of items without (4/3/2/1)
            \ mean number of behaviours i for all item
            \ mean number of behaviours i for the items with i
            \ median number of behaviours i for the items with i
            \ max number of behaviours i for the items with i #------------------------- useful???
            #\ min number of behaviours i for the items with i
            \ mean time between last behaviour to pred day for all item
            \ max time between last behaviour to pred day for all item #--------------------- min???
            \ min time between last behaviour to pred day for all item
            \ time between last behaviour 1 to pred day
            \ time between last behaviour 2 to pred day
            \ time between last behaviour 3 to pred day
            \ time between last behaviour 4 to pred day
            \ mean overall duration
            \ median overall duration
            \ max overall duration
            #\ min overall duration
            #\ min time between last behaviour to pred day for all item
            #total number of items
            #log(t+1) for all above

            Total dimensions: 2 * (1 + 4 + 3 + 12 + 3)

        dict {iid: [1-D feature vector], ...}
            count of items with behaviours 2 except iid
            count of items with behaviours 3 except iid
            count of items with behaviours 4 except iid
            average number of behaviours 1 except iid
            average number of behaviours 2 except iid
            average number of behaviours 3 except iid
            average number of behaviours 4 except iid
            min time between last behaviour 1 to pred day except iid
            min time between last behaviour 2 to pred day except iid
            min time between last behaviour 3 to pred day except iid
            min time between last behaviour 4 to pred day except iid
            min time between last behaviour to pred day except iid
            #4/1 except cid
            #4/2 except cid
            #4/3 except cid

        """
        if time_thresh is None:
            time_thresh = self.TIME_THRESH
        raw_data = []
        iid_dict = {}
        for iid in data:
            if iid > 0:
                temp = [[iid] + entry for entry in data[iid]]
                raw_data += temp
                iid_dict[iid] = [0.0, 0.0, 0.0, 0.0]
                for entry in data[iid]:
                    if entry[0] == 1:
                        iid_dict[iid][0] += 1
                    elif entry[0] == 2:
                        iid_dict[iid][1] += 1
                    elif entry[0] == 3:
                        iid_dict[iid][2] += 1
                    elif entry[0] == 4:
                        iid_dict[iid][3] += 1
        data = raw_data

        #dim = 46

        #feature = np.zeros(dim, dtype=np.float)
        #feature = [0 for i in xrange(dim)]
        feature = []
        item_ids = np.array([data[i][0] for i in xrange(len(data))])
        behaviors = np.array([data[i][1] for i in xrange(len(data))])
        times = np.array([data[i][3] for i in xrange(len(data))])

        b_1 = find_1D(behaviors == 1)
        b_2 = find_1D(behaviors == 2)
        b_3 = find_1D(behaviors == 3)
        b_4 = find_1D(behaviors == 4)

        item_unique = np.unique(item_ids)

        #feature[0] = item_unique.shape[0]
        feature.append(item_unique.shape[0])

        # feature[1] = b_1.shape[0]
        # feature[2] = b_2.shape[0]
        # feature[3] = b_3.shape[0]
        # feature[4] = b_4.shape[0]
        feature += [b_1.shape[0], b_2.shape[0], b_3.shape[0], b_4.shape[0]]
        feature += [
            self.new_divide(b_4.shape[0], b_1.shape[0]),
            self.new_divide(b_4.shape[0], b_2.shape[0]),
            self.new_divide(b_4.shape[0], b_3.shape[0])
        ]

        iid_ratio = np.array([[
            self.new_divide(entry[3], entry[0]),
            self.new_divide(entry[3], entry[1]),
            self.new_divide(entry[3], entry[2])
        ] for entry in iid_dict.values() if entry[3] != 0])
        if iid_ratio.shape == (0L, ):
            iid_ratio = np.array([[0.0, 0.0, 0.0]])
        feature += np.mean(iid_ratio, axis=0).tolist()
        feature += np.median(iid_ratio, axis=0).tolist()
        #feature += np.max(iid_ratio, axis=0).tolist()
        #feature += np.min(iid_ratio, axis=0).tolist()

        #feature += [float(b_1.shape[0])/feature[0], float(b_2.shape[0])/feature[0], float(b_3.shape[0])/feature[0], float(b_4.shape[0])/feature[0]
        #feature += [float(b_1_e.shape[0])/feature[0], float(b_2_e.shape[0])/feature[0], float(b_3_e.shape[0])/feature[0], float(b_4_e.shape[0])/feature[0]

        #feature += [np.unique(item_ids_except[b_2_e]).shape[0]]

        # feature[5] = feature[0] - np.unique(item_ids[b_2]).shape[0]
        # feature[6] = feature[0] - np.unique(item_ids[b_3]).shape[0]
        # feature[7] = feature[0] - np.unique(item_ids[b_4]).shape[0]

        feature += [
            np.unique(item_ids[b_2]).shape[0],
            np.unique(item_ids[b_3]).shape[0],
            np.unique(item_ids[b_4]).shape[0]
        ]
        feature += [
            self.new_divide(
                np.unique(item_ids[b_1]).shape[0],
                item_unique.shape[0] - np.unique(item_ids[b_1]).shape[0]),
            self.new_divide(
                np.unique(item_ids[b_2]).shape[0],
                item_unique.shape[0] - np.unique(item_ids[b_2]).shape[0]),
            self.new_divide(
                np.unique(item_ids[b_3]).shape[0],
                item_unique.shape[0] - np.unique(item_ids[b_3]).shape[0]),
            self.new_divide(
                np.unique(item_ids[b_4]).shape[0],
                item_unique.shape[0] - np.unique(item_ids[b_4]).shape[0])
        ]

        _, counts_1 = np.unique(item_ids[b_1], return_counts=True)
        _, counts_2 = np.unique(item_ids[b_2], return_counts=True)
        _, counts_3 = np.unique(item_ids[b_3], return_counts=True)
        _, counts_4 = np.unique(item_ids[b_4], return_counts=True)

        if counts_1.size == 0:
            counts_1 = np.zeros(1)
        if counts_2.size == 0:
            counts_2 = np.zeros(1)
        if counts_3.size == 0:
            counts_3 = np.zeros(1)
        if counts_4.size == 0:
            counts_4 = np.zeros(1)

        # feature[8] = np.sum(counts_1) / (feature[0] + 0.0)
        # feature[9] = np.sum(counts_2) / (feature[0] + 0.0)
        # feature[10] = np.sum(counts_3) / (feature[0] + 0.0)
        # feature[11] = np.sum(counts_4) / (feature[0] + 0.0)

        feature += [
            self.new_divide(np.sum(counts_1), feature[0]),
            self.new_divide(np.sum(counts_2), feature[0]),
            self.new_divide(np.sum(counts_3), feature[0]),
            self.new_divide(np.sum(counts_4), feature[0])
        ]

        feature += [
            counts_1.mean(),
            counts_2.mean(),
            counts_3.mean(),
            counts_4.mean()
        ]
        feature += [
            np.median(counts_1),
            np.median(counts_2),
            np.median(counts_3),
            np.median(counts_4)
        ]
        feature += [
            np.max(counts_1),
            np.max(counts_2),
            np.max(counts_3),
            np.max(counts_4)
        ]

        # feature[12] = counts_1.max()
        # feature[13] = counts_2.max()
        # feature[14] = counts_3.max()
        # feature[15] = counts_4.max()

        #feature += [counts_1.max(), counts_2.max(), counts_3.max(), counts_4.max()]

        # feature[16] = counts_1.min()
        # feature[17] = counts_2.min()
        # feature[18] = counts_3.min()
        # feature[19] = counts_4.min()

        #feature += [counts_1.min(), counts_2.min(), counts_3.min(), counts_4.min()]

        last_b = [
            time_thresh - max([times[i] for i in find_1D(item_ids == item)])
            for item in item_unique
        ]
        last_b_1 = time_thresh - max([time_start] + [times[i] for i in b_1])
        last_b_2 = time_thresh - max([time_start] + [times[i] for i in b_2])
        last_b_3 = time_thresh - max([time_start] + [times[i] for i in b_3])
        last_b_4 = time_thresh - max([time_start] + [times[i] for i in b_4])
        time_dict = {
            item: [
                time_thresh -
                max([time_start] +
                    [times[i] for i in find_1D(item_ids == item) if i in b_1]),
                time_thresh -
                max([time_start] +
                    [times[i] for i in find_1D(item_ids == item) if i in b_2]),
                time_thresh -
                max([time_start] +
                    [times[i] for i in find_1D(item_ids == item) if i in b_3]),
                time_thresh -
                max([time_start] +
                    [times[i] for i in find_1D(item_ids == item) if i in b_4])
            ]
            for item in item_unique
        }
        time_dict_values = np.array(time_dict.values(), dtype=np.float)
        time_dict_keys = time_dict.keys()
        duration_all = np.array([
            max([times[i] for i in find_1D(item_ids == item)]) -
            min([times[i] for i in find_1D(item_ids == item)])
            for item in item_unique
        ])

        # feature[20] = sum(last_b) / len(last_b)
        # feature[21] = max(last_b)
        # feature[22] = min(last_b)

        #feature += [sum(last_b) / len(last_b), max(last_b), min(last_b)]
        feature += [
            self.new_divide(sum(last_b), len(last_b)),
            max(last_b),
            min(last_b)
        ]
        feature += [last_b_1, last_b_2, last_b_3, last_b_4]
        feature += [
            np.mean(duration_all),
            np.median(duration_all),
            np.max(duration_all)
        ]

        #feature[23] = item_ids.shape[0];

        #feature[23:] = np.log(feature[0:23] + 2)

        #global id
        #id += 1
        #if id%100000 == 0:
        #    print id

        i_feat = {
            iid: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
            for iid in iid_dict if iid in self.P_item_id_unique
        }
        counts = np.array([
            iid_dict[iid] for iid in iid_dict if iid in self.P_item_id_unique
        ])
        counts_cp = np.copy(counts)
        counts_cp = np.sum(counts_cp, axis=0)
        counts[counts > 1] = 1
        counts = np.sum(counts, axis=0)
        for iid in i_feat:
            i_feat[iid][3] = self.new_divide(counts_cp[0] - iid_dict[iid][0],
                                             item_unique.shape[0] - 1)
            i_feat[iid][4] = self.new_divide(counts_cp[1] - iid_dict[iid][1],
                                             item_unique.shape[0] - 1)
            i_feat[iid][5] = self.new_divide(counts_cp[2] - iid_dict[iid][2],
                                             item_unique.shape[0] - 1)
            i_feat[iid][6] = self.new_divide(counts_cp[3] - iid_dict[iid][3],
                                             item_unique.shape[0] - 1)
            if iid_dict[iid][1] > 0:
                i_feat[iid][0] = counts[1] - 1
            else:
                i_feat[iid][0] = counts[1]
            if iid_dict[iid][2] > 0:
                i_feat[iid][1] = counts[2] - 1
            else:
                i_feat[iid][1] = counts[2]
            if iid_dict[iid][3] > 0:
                i_feat[iid][2] = counts[3] - 1
            else:
                i_feat[iid][2] = counts[3]
            temp_keys = [i for i in xrange(len(time_dict_keys))]
            del temp_keys[time_dict_keys.index(iid)]
            temp_values = time_dict_values[temp_keys, :]
            if temp_values.shape == (0L, ) or temp_values.shape == (
                    1L, 0L) or temp_values.shape[0] == 0:
                i_feat[iid] += [
                    time_thresh - time_start, time_thresh - time_start,
                    time_thresh - time_start, time_thresh - time_start,
                    time_thresh - time_start
                ]
            else:
                #print temp_values
                #print temp_values.shape
                temp_values = np.min(temp_values, axis=0)
                i_feat[iid] += temp_values.tolist()
                i_feat[iid].append(np.min(temp_values))