def build(self, skip_cdiscount=False):
    """Build the per-price category counts and the sorted list of prices.

    Iterates over the rows produced by ``parser(self.path)`` after
    discarding the first one (presumably a header row). Rows whose price
    is not strictly positive are ignored. For every remaining row the
    price is passed through ``self.transform`` and a nested counter is
    updated: ``prices[price][category]`` plus ``prices[price]['total']``.

    Side effects: sets ``self.prices`` (nested counts) and ``self.p_list``
    (distinct transformed prices, sorted ascending), then calls
    ``self.build_max_prices()``.

    ``skip_cdiscount`` is accepted for interface compatibility but is not
    used by this method.
    """
    histogram = {}
    distinct_prices = []
    rows = parser(self.path)
    next(rows)  # discard the first row
    self.reset_count(self.train_len)
    print("computing prices dictionary and prices list")
    for row in rows:
        raw_price = float(row[self.price_position])
        if raw_price <= 0:
            # Skipped rows do not go through smart_count().
            continue
        price = self.transform(raw_price)
        category = row[self.c3_position]
        if not smart_in(histogram, price):
            # First time this transformed price is seen.
            distinct_prices.append(price)
            histogram[price] = {category: 1, 'total': 1}
        else:
            per_category = histogram[price]
            if smart_in(per_category, category):
                per_category[category] += 1
            else:
                per_category[category] = 1
            per_category['total'] += 1
        self.smart_count()
        if self.loop_break:
            break
    distinct_prices.sort()
    self.prices = histogram
    self.p_list = distinct_prices
    self.build_max_prices()
def build(self):
    """Build the per-brand category counts.

    Iterates over the rows produced by ``parser(self.path)`` after
    discarding the first one (presumably a header row). Each row's brand
    is normalised with ``self.normalized_brand``; rows for which
    ``self.skip_cdiscount_function(row)`` is true are ignored. For the
    others, ``brands[brand][category]`` and ``brands[brand]["total"]``
    are incremented.

    Side effects: sets ``self.brands`` and calls ``self.build_max_brands()``.
    """
    counts = {}
    rows = parser(self.path)
    print("computing brands dictionary")
    self.reset_count(self.train_len)
    next(rows)  # discard the first row
    for row in rows:
        self.smart_count()
        brand = self.normalized_brand(row[self.brand_position])
        if self.skip_cdiscount_function(row):
            # Note: the loop_break check below is not reached for skipped rows.
            continue
        category = row[self.c3_position]
        if not smart_in(counts, brand):
            counts[brand] = {category: 1, "total": 1}
        else:
            per_category = counts[brand]
            if smart_in(per_category, category):
                per_category[category] += 1
            else:
                per_category[category] = 1
            per_category["total"] += 1
        if self.loop_break:
            break
    self.brands = counts
    self.build_max_brands()
def build(self, skip_cdiscount=False):
    """Build the per-price category counts and the sorted list of prices.

    Iterates over the rows produced by ``parser(self.path)`` after
    discarding the first one (presumably a header row). Rows whose price
    is not strictly positive are ignored. For every remaining row the
    price is passed through ``self.transform`` and a nested counter is
    updated: ``prices[price][category]`` plus ``prices[price]['total']``.

    Side effects: sets ``self.prices`` (nested counts) and ``self.p_list``
    (distinct transformed prices, sorted ascending), then calls
    ``self.build_max_prices()``.

    ``skip_cdiscount`` is accepted for interface compatibility but is not
    used by this method.
    """
    histogram = {}
    distinct_prices = []
    rows = parser(self.path)
    next(rows)  # discard the first row
    self.reset_count(self.train_len)
    print("computing prices dictionary and prices list")
    for row in rows:
        raw_price = float(row[self.price_position])
        if raw_price <= 0:
            # Skipped rows do not go through smart_count().
            continue
        price = self.transform(raw_price)
        category = row[self.c3_position]
        if not smart_in(histogram, price):
            # First time this transformed price is seen.
            distinct_prices.append(price)
            histogram[price] = {category: 1, 'total': 1}
        else:
            per_category = histogram[price]
            if smart_in(per_category, category):
                per_category[category] += 1
            else:
                per_category[category] = 1
            per_category['total'] += 1
        self.smart_count()
        if self.loop_break:
            break
    distinct_prices.sort()
    self.prices = histogram
    self.p_list = distinct_prices
    self.build_max_prices()
def build(self):
    """Build the per-brand category counts.

    Iterates over the rows produced by ``parser(self.path)`` after
    discarding the first one (presumably a header row). Each row's brand
    is normalised with ``self.normalized_brand``; rows for which
    ``self.skip_cdiscount_function(row)`` is true are ignored. For the
    others, ``brands[brand][category]`` and ``brands[brand]['total']``
    are incremented.

    Side effects: sets ``self.brands`` and calls ``self.build_max_brands()``.
    """
    counts = {}
    rows = parser(self.path)
    print("computing brands dictionary")
    self.reset_count(self.train_len)
    next(rows)  # discard the first row
    for row in rows:
        self.smart_count()
        brand = self.normalized_brand(row[self.brand_position])
        if self.skip_cdiscount_function(row):
            # Note: the loop_break check below is not reached for skipped rows.
            continue
        category = row[self.c3_position]
        if not smart_in(counts, brand):
            counts[brand] = {category: 1, 'total': 1}
        else:
            per_category = counts[brand]
            if smart_in(per_category, category):
                per_category[category] += 1
            else:
                per_category[category] = 1
            per_category['total'] += 1
        if self.loop_break:
            break
    self.brands = counts
    self.build_max_brands()
def compute_category(self, item):
    """Return the category whose centroid best matches ``item``.

    The item's vocabulary is extracted with ``self.voc_from_item`` (and
    expanded with ``self.word_product`` when ``self.product`` is set),
    then turned into word counts. Candidate categories are those listed
    in ``self.word_cats_dict`` for at least one of the item's words. Each
    candidate is scored as the sum over the item's words of
    ``centroid[word] * count``; the first candidate with the strictly
    highest score wins.

    Returns ``1000009411`` when no candidate scores above zero.
    """
    timer = Timer()
    timer.pick("debut")
    words = self.voc_from_item(item)
    if self.product:
        words = self.word_product(words)
    word_counts = self.word_dic_from_list(words)["dic"]
    best_score = 0
    best_cat = 1000009411  # fallback category id
    timer.pick("vocabulaire construit")

    candidates = set()
    for word in word_counts:
        try:
            known_cats = self.word_cats_dict[word]
        except KeyError:
            # No category is known for this word: it was never seen in
            # the training set.
            continue
        candidates = candidates.union(known_cats)

    for cat in candidates:
        centroid = self.centroids[cat]
        score = 0
        for word, count in word_counts.items():
            if smart_in(centroid, word):
                score += centroid[word] * count
        if score > best_score:
            best_score, best_cat = score, cat

    timer.pick("best_cat chope")
    return best_cat
def compute_category(self, item):
    """Pick a category for ``item`` from the brand and price sub-models.

    The brand and price columns are read at the training positions when
    ``self.train`` is true, otherwise at the ``*_test`` positions. A brand
    not present in ``self.model_brand.brands`` is replaced by ``NO_BRAND``.

    * Negative price: the category comes from the brand model.
    * Otherwise the price is transformed and snapped to the nearest entry
      of ``self.model_price.p_list``; the brand model is used only when
      the brand is known and its ``proba`` is strictly higher than the
      price's ``proba``; in every other case the price model is used.

    NOTE(review): the sibling price ``build`` in this file drops prices
    ``<= 0`` whereas this method only treats ``< 0`` as "no price";
    confirm that a price of exactly 0 is meant to go through the price
    model.
    """
    brand_model = self.model_brand
    price_model = self.model_price
    if self.train:
        brand_position = self.brand_position
        price_position = self.price_position
    else:
        brand_position = self.brand_position_test
        price_position = self.price_position_test

    raw_brand = item[brand_position]
    brand = raw_brand if smart_in(brand_model.brands, raw_brand) else NO_BRAND

    price = float(item[price_position])
    if price < 0:
        return brand_model.cat_from_brand(brand)

    price = find_nearest(price_model.p_list, price_model.transform(price))
    # Test the cheap "brand is known" guard first: this avoids looking up
    # proba[NO_BRAND], which would raise KeyError if no such entry exists.
    brand_wins = (
        brand != NO_BRAND
        and brand_model.proba[brand]['proba'] > price_model.proba[price]['proba']
    )
    if brand_wins:
        return brand_model.cat_from_brand(brand)
    return price_model.cat_from_price(price)
def compute_category(self, item):
    """Pick a category for ``item`` from the brand and price sub-models.

    The brand and price columns are read at the training positions when
    ``self.train`` is true, otherwise at the ``*_test`` positions. A brand
    not present in ``self.model_brand.brands`` is replaced by ``NO_BRAND``.

    * Negative price: the category comes from the brand model.
    * Otherwise the price is transformed and snapped to the nearest entry
      of ``self.model_price.p_list``; the brand model is used only when
      the brand is known and its ``proba`` is strictly higher than the
      price's ``proba``; in every other case the price model is used.

    NOTE(review): the sibling price ``build`` in this file drops prices
    ``<= 0`` whereas this method only treats ``< 0`` as "no price";
    confirm that a price of exactly 0 is meant to go through the price
    model.
    """
    brand_model = self.model_brand
    price_model = self.model_price
    if self.train:
        brand_position = self.brand_position
        price_position = self.price_position
    else:
        brand_position = self.brand_position_test
        price_position = self.price_position_test

    raw_brand = item[brand_position]
    brand = raw_brand if smart_in(brand_model.brands, raw_brand) else NO_BRAND

    price = float(item[price_position])
    if price < 0:
        return brand_model.cat_from_brand(brand)

    price = find_nearest(price_model.p_list, price_model.transform(price))
    # Test the cheap "brand is known" guard first: this avoids looking up
    # proba[NO_BRAND], which would raise KeyError if no such entry exists.
    brand_wins = (
        brand != NO_BRAND
        and brand_model.proba[brand]['proba'] > price_model.proba[price]['proba']
    )
    if brand_wins:
        return brand_model.cat_from_brand(brand)
    return price_model.cat_from_price(price)
def word_dic_from_list(self, word_list):
    """Count word occurrences in ``word_list``.

    Accepts any iterable of hashable words and returns a dict of the form
    ``{"dic": {word: count, ...}, "total": number_of_words}``.
    An empty input yields ``{"dic": {}, "total": 0}``.
    """
    counts = {}
    total = 0
    for word in word_list:
        # The dict is local and plain, so dict.get is enough here.
        counts[word] = counts.get(word, 0) + 1
        total += 1
    return {"dic": counts, "total": total}
def compute_category(self, item):
    """Core function: associate an item with a category via its brand.

    ``item`` is a row just read from the file. The brand is read at
    ``self.brand_position`` when ``self.train`` is true, otherwise at
    ``self.brand_position_test``. A brand absent from ``self.brands`` is
    replaced by ``NO_BRAND`` before asking ``self.cat_from_brand``.

    NOTE(review): a sibling ``build`` in this file keys its brands dict by
    ``self.normalized_brand(...)``; if ``self.brands`` comes from that
    method, the raw brand is looked up here without normalisation.
    Confirm this is intended.
    """
    position = self.brand_position if self.train else self.brand_position_test
    brand = item[position]
    if not smart_in(self.brands, brand):
        brand = NO_BRAND
    return self.cat_from_brand(brand)
def compute_category(self, item):
    """Core function: associate an item with a category via its brand.

    ``item`` is a row just read from the file. The brand is read at
    ``self.brand_position`` when ``self.train`` is true, otherwise at
    ``self.brand_position_test``. A brand absent from ``self.brands`` is
    replaced by ``NO_BRAND`` before asking ``self.cat_from_brand``.

    NOTE(review): a sibling ``build`` in this file keys its brands dict by
    ``self.normalized_brand(...)``; if ``self.brands`` comes from that
    method, the raw brand is looked up here without normalisation.
    Confirm this is intended.
    """
    position = self.brand_position if self.train else self.brand_position_test
    brand = item[position]
    if not smart_in(self.brands, brand):
        brand = NO_BRAND
    return self.cat_from_brand(brand)
def build(self):
    """Fit a ``TfidfVectorizer`` on one concatenated document per category.

    Iterates over the rows produced by ``parser(self.path)`` after
    discarding the first one (presumably a header row), ignoring rows for
    which ``self.skip_cdiscount_function(row)`` is true. Descriptions
    (``row[self.desc_position]``) are grouped by category
    (``row[self.c3_position]``) and concatenated without a separator, one
    text per category, then used to fit the vectorizer.

    Returns the fitted ``TfidfVectorizer``.
    """
    rows = parser(self.path)
    descriptions = {}
    # NOTE(review): the message says "brands" although this method groups
    # descriptions; the text is left unchanged in case logs are parsed.
    print("computing brands dictionary")
    self.reset_count(self.train_len)
    next(rows)  # discard the first row
    for row in rows:
        self.smart_count()
        desc = row[self.desc_position]
        if self.skip_cdiscount_function(row):
            continue
        category = row[self.c3_position]
        if smart_in(descriptions, category):
            descriptions[category].append(desc)
        else:
            descriptions[category] = [desc]
        if self.loop_break:
            break

    # Debug output for one hard-coded category. Use .get so that a run in
    # which this category never appears does not die with KeyError.
    print(len(descriptions.get("1000015309", [])))

    final_data = {}
    for cat_id, texts in descriptions.items():
        # join() instead of repeated += keeps concatenation linear.
        final_text = ''.join(texts)
        final_data[cat_id] = final_text
        print(final_text)

    vectorizer = TfidfVectorizer()
    # The transformed matrix is not needed; only the fitted state is kept.
    vectorizer.fit_transform(list(final_data.values()))
    return vectorizer
def add_word(self, word, n=1):
    """Record ``n`` more occurrences of ``word``.

    Adds ``n`` to ``self.dic[word]`` (creating the entry when the word is
    new) and adds ``n`` to ``self.total``.
    """
    counts = self.dic
    if not smart_in(counts, word):
        counts[word] = n
    else:
        counts[word] += n
    self.total += n
def build(self):
    """Build the word -> categories index and the per-category word counts.

    Iterates over the rows produced by ``parser(self.path)`` after
    discarding the first one (presumably a header row). Rows rejected by
    ``self.skip_cdiscount_function`` or ``self.skip_book_function`` are
    ignored, and when ``self.cat_count`` is not ``None`` the number of
    rows used per category is capped.

    For each kept row:

    * every word of ``self.voc_from_item(row, train=True)`` is linked to
      the row's category in ``self.word_cats_dict`` (a list of distinct
      categories per word);
    * the vocabulary (expanded with ``self.word_product`` when
      ``self.product`` is set) is added to the category's ``WordDic``.

    Afterwards, words linked to exactly one category are kept only if
    that category's ``del_word(word)`` returns a true value; words linked
    to several categories are always kept.

    Side effects: sets ``self.word_cats_dict`` and ``self.cats``, prints a
    few diagnostics, and calls ``self.build_centroids()``.
    """
    self.word_cats_dict = {}
    cats = {}
    cat_count = {}
    rows = parser(self.path)
    print("computing category dictionary")
    self.reset_count(self.train_len)
    next(rows)  # discard the first row
    for row in rows:
        cat = row[self.c3_position]
        if self.skip_cdiscount_function(row):
            continue
        if self.skip_book_function(row):
            continue
        if self.cat_count is not None:
            # NOTE(review): as written this admits up to
            # self.cat_count + 1 rows per category; confirm intended.
            if not smart_in(cat_count, cat):
                cat_count[cat] = 1
            elif cat_count[cat] > self.cat_count:
                continue
            else:
                cat_count[cat] += 1
        self.smart_count()
        if self.loop_break:
            break

        voc = self.voc_from_item(row, train=True)
        for word in voc:
            if smart_in(self.word_cats_dict, word):
                if cat not in self.word_cats_dict[word]:
                    self.word_cats_dict[word].append(cat)
            else:
                self.word_cats_dict[word] = [cat]

        if self.product:
            voc = self.word_product(voc)
        for word in voc:
            if not smart_in(cats, cat):
                cats[cat] = WordDic()
            cats[cat].add_word(word)

    print(len(self.word_cats_dict))
    kept = {}
    for word in self.word_cats_dict:
        word_cats = self.word_cats_dict[word]
        # del_word is only consulted for words tied to a single category.
        if len(word_cats) != 1 or cats[word_cats[0]].del_word(word):
            kept[word] = word_cats
    self.word_cats_dict = kept
    print(len(self.word_cats_dict))

    # Rough diagnostics only: product of two shallow getsizeof() values,
    # not a real memory footprint. Guarded so that an empty dictionary
    # reports 0 instead of raising StopIteration from next(iter(...)).
    word_cats_size = 0
    if kept:
        word_cats_size = sys.getsizeof(kept) * sys.getsizeof(kept[next(iter(kept))])
    print("size of word_cats : %s " % (word_cats_size,))
    cats_size = 0
    if cats:
        cats_size = sys.getsizeof(cats) * sys.getsizeof(next(iter(cats)))
    print("size of cats : %s " % (cats_size,))

    self.cats = cats
    self.build_centroids()
def add_word(self, word, n=1):
    """Record ``n`` more occurrences of ``word``.

    Adds ``n`` to ``self.dic[word]`` (creating the entry when the word is
    new) and adds ``n`` to ``self.total``.
    """
    counts = self.dic
    if not smart_in(counts, word):
        counts[word] = n
    else:
        counts[word] += n
    self.total += n