def construct_test_set(attribute):
    """Write a label-balanced test set for ``attribute`` built from mentions.

    Walks every document of ``Connection().jd.test_users``.  A user is kept
    when ``user['profile'][attribute]`` contains a 1 (the position of that 1
    is the label) and a uniform random draw does not exceed
    ``balance_params[label]``.  For each kept user the entries of
    ``user['mentions']`` that are present in the feature mapping are written
    to ``label2trainset/<attribute>_test.data`` as one line::

        <label> <feature index>:<value> <feature index>:<value> ...

    with the pairs ordered by feature index.  The user's ``_id`` is written
    to ``label2trainset/<attribute>_test_uids.data`` so that line *k* of both
    files refers to the same user.  Users without any known feature are
    skipped.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file names.
    """
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute
    uid_path = RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute
    # The context manager guarantees both files are flushed and closed, even
    # when a malformed record raises half-way through the collection.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # No usable label for this attribute: skip the user.
                continue
            if random.random() > balance_params[label]:
                continue
            features = user['mentions']
            sorted_feature = sorted(
                ((all_features[f], features[f])
                 for f in features if f in all_features),
                key=lambda d: d[0])
            if not sorted_feature:
                continue
            # Build the whole line first so a formatting error cannot leave
            # a partial line (or a uid without a matching data line) behind.
            line = '%d%s\n' % (
                label, ''.join(' %s:%f' % pair for pair in sorted_feature))
            fout.write(line)
            uid_output.write('%s\n' % user['_id'])
            # Progress is only advanced for users that were written.
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Write a label-balanced self-training test set for ``attribute``.

    Walks every document of ``Connection().jd.test_users``.  Each user's
    features are the counts of ``user['products']`` overlaid with the
    entries of ``user['mentions']`` (a mention overrides a product count
    with the same key).  A user is kept when
    ``user['profile'][attribute]`` contains a 1 (its position is the label)
    and a uniform random draw does not exceed ``balance_params[label]``.
    Kept users with at least one feature known to the mapping loaded from
    ``/features/mention.feature`` are written to
    ``self_training_file_dir + 'test_<attribute>.data'`` as::

        <label> <feature index>:<integer value> ...

    ordered by feature index.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file name.
    """
    all_features = get_features(
        feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = self_training_file_dir + 'test_%s.data' % attribute
    # The context manager guarantees the file is flushed and closed, even
    # when a malformed record raises half-way through the collection.
    with open(data_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            # Features are assembled before the label lookup, so a user
            # lacking 'products' or 'mentions' still raises as before.
            features = dict(Counter(user['products']))
            features.update(user['mentions'])
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # No usable label for this attribute: skip the user.
                continue
            if random.random() > balance_params[label]:
                continue
            sorted_feature = sorted(
                ((all_features[f], features[f])
                 for f in features if f in all_features),
                key=lambda d: d[0])
            if not sorted_feature:
                continue
            fout.write('%d%s\n' % (
                label, ''.join(' %s:%d' % pair for pair in sorted_feature)))
            # Progress is only advanced for users that were written.
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Write a label-balanced self-training test set for ``attribute``.

    For every document of ``Connection().jd.test_users`` the feature dict is
    ``Counter(user['products'])`` updated with ``user['mentions']``.  The
    label is the index of the first 1 in ``user['profile'][attribute]``;
    users without one are skipped, and the remaining users are subsampled
    by keeping a user only when ``random.random() <= balance_params[label]``.
    Features found in the mapping loaded from ``/features/mention.feature``
    are written, ordered by feature index, to
    ``self_training_file_dir + 'test_<attribute>.data'``, one line per user::

        <label> <feature index>:<integer value> ...

    Users with no known feature produce no line.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file name.
    """
    all_features = get_features(
        feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    out_path = self_training_file_dir + 'test_%s.data' % attribute
    # ``with`` closes (and therefore flushes) the output file on every exit
    # path; the previous bare ``open`` never closed it.
    with open(out_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            # Built before the label lookup on purpose: missing 'products'
            # or 'mentions' keys keep raising exactly as they did before.
            features = dict(Counter(user['products']))
            features.update(user['mentions'])
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Unlabelled for this attribute.
                continue
            if random.random() > balance_params[label]:
                continue
            indexed = [(all_features[name], features[name])
                       for name in features if name in all_features]
            if not indexed:
                continue
            indexed.sort(key=lambda d: d[0])
            body = ''.join(' %s:%d' % pair for pair in indexed)
            fout.write('%d%s\n' % (label, body))
            # Only written users advance the progress bar.
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Write a label-balanced test set combining mentions, products and reviews.

    Three feature mappings are loaded with ``get_features``; the second and
    third are given the previous mapping as ``existent_features``
    (presumably so that indices continue after the earlier ones -- confirm
    against ``get_features``).

    For every document of ``Connection().jd.test_users`` that has a 1 in
    ``user['profile'][attribute]`` (its position is the label) and survives
    the ``balance_params[label]`` subsampling, the following are collected:

    * ``combine_features(user['mentions_0'], Counter(user['products']))``,
      looked up in the first mapping;
    * ``user['mentions_1_1']`` with ``'_1'`` appended to each key, looked up
      in the second mapping;
    * counts of ``user['review']``, looked up in the third mapping.

    Unknown features are dropped.  Users with at least one feature get a
    line ``<label> <index>:<value> ...`` (ordered by index) in
    ``iterate_label2trainset/<attribute>_test.data`` and their ``_id`` in
    ``iterate_label2trainset/<attribute>_test_uids.data``.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file names.
    """
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=all_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = (RAW_DATA_DIR
                 + 'iterate_label2trainset/%s_test.data' % attribute)
    uid_path = (RAW_DATA_DIR
                + 'iterate_label2trainset/%s_test_uids.data' % attribute)
    # The context manager guarantees both files are flushed and closed, even
    # when a malformed record raises half-way through the collection.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # No usable label for this attribute: skip the user.
                continue
            if random.random() > balance_params[label]:
                continue
            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            sorted_feature = [(all_features[f], features[f])
                              for f in features if f in all_features]
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'
                if f in all_features_1:
                    sorted_feature.append((all_features_1[f], v))
            for f, v in Counter(user['review']).items():
                if f in review_features:
                    sorted_feature.append((review_features[f], v))
            if not sorted_feature:
                continue
            # Diagnostic: report index multiplicities when two sources
            # resolved to the same feature index.
            keys = [pair[0] for pair in sorted_feature]
            if len(keys) != len(set(keys)):
                print(Counter(keys).values())
            sorted_feature.sort(key=lambda d: d[0])
            # Build the whole line first so the data and uid files cannot
            # get out of step on a formatting error.
            line = '%d%s\n' % (
                label, ''.join(' %s:%f' % pair for pair in sorted_feature))
            fout.write(line)
            uid_output.write('%s\n' % user['_id'])
            # Progress is only advanced for users that were written.
            bar.draw(index + 1)
def statistics(attribute, threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    """Estimate how each feature is distributed over the two labels.

    For every document of ``Connection().jd.test_users`` with a 1 in
    ``user['profile'][attribute]`` (its position is the label) the feature
    set is ``user['mentions']`` overlaid with the counts of
    ``user['products']``.  Users with fewer than 10 features are ignored.
    Each feature known to ``get_features(feature_file_name)`` counts the
    number of users per label it occurs in (presence, not magnitude).

    Features whose total count is below ``threshold`` are dropped.  The
    counts are then divided by the number of users of each label and
    rescaled so the two values of a feature sum to 1.  A feature that was
    never observed, or a label without users, now yields 0.0 instead of
    raising ``ZeroDivisionError`` (which the default ``threshold=-1`` made
    reachable, because unseen features are not filtered out).

    NOTE(review): the accumulators have exactly two slots, so a label index
    above 1 raises ``IndexError`` -- assumes binary attributes.

    Args:
        attribute: key into ``user['profile']``.
        threshold: minimum total user count for a feature to be kept.
        feature_file_name: passed to ``get_features``.
        show: when true, print the 50 most label-skewed features instead of
            returning the table.

    Returns:
        ``{feature: [share_label_0, share_label_1]}`` when ``show`` is
        false; ``None`` when ``show`` is true.
    """
    collection = Connection().jd.test_users
    # The result is unused here (subsampling is disabled in this function);
    # the call is kept in case it has side effects -- TODO confirm and drop.
    get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict((f, [0., 0.]) for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception:
            # No usable label; ``Exception`` (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            continue
        features = dict(user['mentions'])
        features.update(Counter(user['products']))
        if len(features) < 10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        labels_distribute[label] += 1
        bar.draw(index)

    # Iterate over a snapshot of the keys: entries are deleted in the loop.
    for f in list(distribute):
        if sum(distribute[f]) < threshold:
            del distribute[f]
    print(labels_distribute)

    for counts in distribute.values():
        # Per-label frequency; a label nobody has implies a zero count.
        for lbl in (0, 1):
            total = labels_distribute[lbl]
            counts[lbl] = counts[lbl] / total if total else 0.
        # Rescale to shares; unseen features stay at [0.0, 0.0].
        s = sum(counts)
        if s:
            counts[0] /= s
            counts[1] /= s

    if not show:
        return distribute

    def _share0(values):
        # Smoothed share of label 0, identical to the printed value.
        return (values[0] + 0.1) / (sum(values) + 0.1)

    ranked = sorted(distribute.items(),
                    key=lambda d: abs(1 - 2 * _share0(d[1])),
                    reverse=True)
    print('')
    for name, values in ranked[:50]:
        share0 = _share0(values)
        print('%s 0:%0.3f 1:%0.3f' % (name.encode('utf8'), share0, 1 - share0))
def construct_test_set(attribute):
    """Write the label-balanced ``multi_clf`` test set for ``attribute``.

    Two feature mappings are loaded; the second one
    (``/features/mention_1.feature``) starts numbering right after the
    largest index of the first.

    For every document of ``Connection().jd.test_users`` that has a 1 in
    ``user['profile'][attribute]`` (its position is the label) and survives
    the ``balance_params[label]`` subsampling, the following are collected:

    * ``combine_features(user['mentions_0'], Counter(user['products']))``,
      looked up in the first mapping;
    * ``user['mentions_1_1']`` -- currently disabled: the field is replaced
      by an empty dict right before it is read;
    * if present, the absolute values of
      ``user['user_product_vector_from_deepwalk']``, placed at consecutive
      indices after the largest index of the second mapping.

    Users with at least one feature get a line
    ``<label> <index>:<value> ...`` (ordered by index) in
    ``multi_clf/<attribute>_test.data``.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file name.
    """
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute
    # First index used for the deepwalk vector.  It does not change between
    # users, so it is computed once, lazily, on the first user that needs it
    # (an empty mapping therefore still only fails when a vector is seen).
    deepwalk_start = None
    # The context manager guarantees the file is flushed and closed, even
    # when a malformed record raises half-way through the collection.
    with open(data_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # No usable label for this attribute: skip the user.
                continue
            if random.random() > balance_params[label]:
                continue
            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            sorted_feature = [(all_features[f], features[f])
                              for f in features if f in all_features]
            # NOTE: mention_1 features are switched off by blanking the
            # field; delete the next line to re-enable the loop below.
            user['mentions_1_1'] = {}
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'
                if f in all_features_1:
                    sorted_feature.append((all_features_1[f], v))
            if 'user_product_vector_from_deepwalk' in user:
                if deepwalk_start is None:
                    deepwalk_start = max(all_features_1.values()) + 1
                vector = user['user_product_vector_from_deepwalk']
                sorted_feature.extend(
                    (i + deepwalk_start, abs(v))
                    for i, v in enumerate(vector))
            if not sorted_feature:
                continue
            # Diagnostic: report index multiplicities when two sources
            # resolved to the same feature index.
            keys = [pair[0] for pair in sorted_feature]
            if len(keys) != len(set(keys)):
                print(Counter(keys).values())
            sorted_feature.sort(key=lambda d: d[0])
            fout.write('%d%s\n' % (
                label, ''.join(' %s:%f' % pair for pair in sorted_feature)))
            # Progress is only advanced for users that were written.
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Write a label-balanced test set combining mentions, products and reviews.

    Loads three feature mappings with ``get_features``, chaining each one to
    the previous through ``existent_features`` (presumably to keep indices
    disjoint -- confirm against ``get_features``).

    Every document of ``Connection().jd.test_users`` is labelled with the
    index of the first 1 in ``user['profile'][attribute]``; unlabelled users
    are skipped and the rest are kept only when
    ``random.random() <= balance_params[label]``.  For a kept user the
    feature list is built from:

    * ``combine_features(user['mentions_0'], Counter(user['products']))``
      via the first mapping;
    * ``user['mentions_1_1']`` with keys suffixed by ``'_1'`` via the second
      mapping;
    * counts of ``user['review']`` via the third mapping.

    Features missing from their mapping are ignored.  If anything remains,
    ``<label> <index>:<value> ...`` (ordered by index) is appended to
    ``iterate_label2trainset/<attribute>_test.data`` and the user's ``_id``
    to ``iterate_label2trainset/<attribute>_test_uids.data``.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file names.
    """
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=all_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    out_dir = RAW_DATA_DIR + 'iterate_label2trainset/'
    data_path = out_dir + '%s_test.data' % attribute
    uid_path = out_dir + '%s_test_uids.data' % attribute
    # ``with`` closes (and therefore flushes) both outputs on every exit
    # path; the previous bare ``open`` calls never closed them.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Unlabelled for this attribute.
                continue
            if random.random() > balance_params[label]:
                continue
            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            pairs = [(all_features[name], features[name])
                     for name in features if name in all_features]
            for name, value in user['mentions_1_1'].items():
                name = name + '_1'
                if name in all_features_1:
                    pairs.append((all_features_1[name], value))
            for name, value in Counter(user['review']).items():
                if name in review_features:
                    pairs.append((review_features[name], value))
            if not pairs:
                continue
            # Diagnostic only: print how often each index occurs when the
            # same index was produced more than once.
            indices = [pair[0] for pair in pairs]
            if len(indices) != len(set(indices)):
                print(Counter(indices).values())
            pairs.sort(key=lambda d: d[0])
            # Format before writing so both files stay line-aligned even if
            # a value cannot be formatted.
            line = '%d%s\n' % (
                label, ''.join(' %s:%f' % pair for pair in pairs))
            fout.write(line)
            uid_output.write('%s\n' % user['_id'])
            # Only written users advance the progress bar.
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Write the label-balanced ``multi_clf`` test set for ``attribute``.

    Loads the base feature mapping and the ``mention_1`` mapping, the latter
    numbered from one past the largest base index.

    Every document of ``Connection().jd.test_users`` is labelled with the
    index of the first 1 in ``user['profile'][attribute]``; unlabelled users
    are skipped and the rest are kept only when
    ``random.random() <= balance_params[label]``.  For a kept user the
    feature list is built from:

    * ``combine_features(user['mentions_0'], Counter(user['products']))``
      via the base mapping;
    * ``user['mentions_1_1']`` -- effectively off, because the field is
      overwritten with ``{}`` immediately before it is iterated;
    * the absolute values of ``user['user_product_vector_from_deepwalk']``
      when that key exists, at consecutive indices starting one past the
      largest ``mention_1`` index.

    If anything was collected, ``<label> <index>:<value> ...`` (ordered by
    index) is appended to ``multi_clf/<attribute>_test.data``.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file name.
    """
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    out_path = RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute
    # Offset of the deepwalk block.  Identical for every user, so it is
    # resolved once on first use rather than recomputed per user; resolving
    # lazily keeps an empty mapping from failing unless a vector shows up.
    vector_offset = None
    # ``with`` closes (and therefore flushes) the output file on every exit
    # path; the previous bare ``open`` never closed it.
    with open(out_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Unlabelled for this attribute.
                continue
            if random.random() > balance_params[label]:
                continue
            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            pairs = [(all_features[name], features[name])
                     for name in features if name in all_features]
            # NOTE: this reset disables the mention_1 features; remove it
            # to make the following loop contribute again.
            user['mentions_1_1'] = {}
            for name, value in user['mentions_1_1'].items():
                name = name + '_1'
                if name in all_features_1:
                    pairs.append((all_features_1[name], value))
            if 'user_product_vector_from_deepwalk' in user:
                if vector_offset is None:
                    vector_offset = max(all_features_1.values()) + 1
                for i, v in enumerate(
                        user['user_product_vector_from_deepwalk']):
                    pairs.append((i + vector_offset, abs(v)))
            if not pairs:
                continue
            # Diagnostic only: print how often each index occurs when the
            # same index was produced more than once.
            indices = [pair[0] for pair in pairs]
            if len(indices) != len(set(indices)):
                print(Counter(indices).values())
            pairs.sort(key=lambda d: d[0])
            body = ''.join(' %s:%f' % pair for pair in pairs)
            fout.write('%d%s\n' % (label, body))
            # Only written users advance the progress bar.
            bar.draw(index + 1)
def statistics(attribute, threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    """Compute, per feature, its relative share among label 0 and label 1.

    Scans ``Connection().jd.test_users``.  The label of a user is the index
    of the first 1 in ``user['profile'][attribute]``; users without one are
    skipped, as are users whose combined feature dict
    (``user['mentions']`` updated with ``Counter(user['products'])``) has
    fewer than 10 entries.  Every feature listed by
    ``get_features(feature_file_name)`` is counted once per user it
    appears in, separately for each label.

    After the scan, features seen in fewer than ``threshold`` users are
    removed, each count is divided by the number of users with that label,
    and the two resulting frequencies are normalised to sum to 1.

    Zero denominators are handled explicitly: with the default
    ``threshold=-1`` features that never occur are kept, and normalising
    their ``[0.0, 0.0]`` previously raised ``ZeroDivisionError``; they now
    stay ``[0.0, 0.0]``.  The same applies when a label has no users.

    NOTE(review): only two label slots exist, so a label index greater
    than 1 raises ``IndexError`` -- assumes binary attributes.

    Args:
        attribute: key into ``user['profile']``.
        threshold: minimum number of users a feature must occur in.
        feature_file_name: passed to ``get_features``.
        show: if true, print the 50 features with the most one-sided
            distribution and return nothing.

    Returns:
        A dict ``{feature: [share_label_0, share_label_1]}`` if ``show`` is
        false, otherwise ``None``.
    """
    collection = Connection().jd.test_users
    # Return value unused (no subsampling happens in this function); the
    # call itself is retained in case it has side effects -- TODO confirm.
    get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict((name, [0., 0.]) for name in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception:
            # Unlabelled user.  Catching ``Exception`` instead of using a
            # bare except lets KeyboardInterrupt/SystemExit through.
            continue
        features = dict(user['mentions'])
        features.update(Counter(user['products']))
        if len(features) < 10:
            continue
        for name in features:
            if name in distribute:
                distribute[name][label] += 1
        labels_distribute[label] += 1
        bar.draw(index)

    # Collect the rare features first, then delete: the dict must not
    # change size while it is being iterated.
    rare = [name for name in distribute
            if sum(distribute[name]) < threshold]
    for name in rare:
        del distribute[name]
    print(labels_distribute)

    users_0, users_1 = labels_distribute
    for counts in distribute.values():
        # Frequency within each label; no users of a label means the
        # count is necessarily zero as well.
        counts[0] = counts[0] / users_0 if users_0 else 0.
        counts[1] = counts[1] / users_1 if users_1 else 0.
        total = sum(counts)
        if total:
            counts[0] /= total
            counts[1] /= total

    if not show:
        return distribute

    def _smoothed_share(values):
        # Share of label 0 with 0.1 additive smoothing, as printed below.
        return (values[0] + 0.1) / (sum(values) + 0.1)

    ranked = sorted(distribute.items(),
                    key=lambda d: abs(1 - 2 * _smoothed_share(d[1])),
                    reverse=True)
    print('')
    for name, values in ranked[:50]:
        share = _smoothed_share(values)
        print('%s 0:%0.3f 1:%0.3f' % (name.encode('utf8'), share, 1 - share))
def construct_test_set(attribute):
    """Write the full (unbalanced) test set for ``attribute``.

    Five feature mappings (product, mention, review, mention_1, mention_2)
    are loaded with ``get_features``, each one receiving the previous
    mapping as ``existent_features`` (presumably so indices continue after
    the earlier ones -- confirm against ``get_features``).

    Every document of ``Connection().jd.test_users`` with a 1 in
    ``user['profile'][attribute]`` (its position is the label) is written;
    the balance parameters are computed and printed but no subsampling is
    applied in this variant.  Collected per user:

    * counts of ``user['products']`` via the product mapping;
    * ``user['mentions_0']`` via the mention mapping;
    * counts of ``user['review']`` via the review mapping;
    * ``user['mentions_1']`` and ``user['mentions_2']`` -- both currently
      disabled: each field is replaced by an empty dict before it is read.

    One line ``<label> <index>:<value> ...`` (ordered by index) goes to
    ``iterate_label2trainset/<attribute>_test.data`` and the user's ``_id``
    to ``iterate_label2trainset/<attribute>_test_uids.data``.  A user
    without any known feature still produces a line holding just the label
    followed by a space.

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file names.
    """
    product_features = get_features(
        feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(
        feature_file=base_dir + '/features/mention.feature',
        existent_features=product_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=mention_features)
    mention_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=review_features)
    mention_features_2 = get_features(
        feature_file=base_dir + '/features/mention_2.feature',
        existent_features=mention_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    # Two spaces on purpose: reproduces the output of the former
    # ``print 'Balance params: ', balance_params`` statement exactly.
    print('Balance params:  %s' % (balance_params,))
    bar = progress_bar(collection.count())
    data_path = (RAW_DATA_DIR
                 + 'iterate_label2trainset/%s_test.data' % attribute)
    uid_path = (RAW_DATA_DIR
                + 'iterate_label2trainset/%s_test_uids.data' % attribute)
    # The context manager guarantees both files are flushed and closed, even
    # when a malformed record raises half-way through the collection.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # No usable label for this attribute: skip the user.
                continue
            x = []
            for f, v in Counter(user['products']).items():
                if f in product_features:
                    x.append((product_features[f], v))
            for f, v in user['mentions_0'].items():
                if f in mention_features:
                    x.append((mention_features[f], v))
            for f, v in Counter(user['review']).items():
                if f in review_features:
                    x.append((review_features[f], v))
            # NOTE: mention_1 features are switched off by blanking the
            # field; delete the reset to re-enable the loop below.
            user['mentions_1'] = {}
            for f, v in user['mentions_1'].items():
                f = f + '_1'
                if f in mention_features_1:
                    x.append((mention_features_1[f], v))
            # NOTE: same switch for mention_2.  Unlike mention_1, no suffix
            # is appended to the key here -- TODO confirm that is intended.
            user['mentions_2'] = {}
            for f, v in user['mentions_2'].items():
                if f in mention_features_2:
                    x.append((mention_features_2[f], v))
            x.sort(key=lambda d: d[0])
            str_x = ' '.join('%s:%f' % pair for pair in x)
            fout.write('%d %s\n' % (label, str_x))
            uid_output.write('%s\n' % (user['_id']))
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Write the full (unbalanced) test set for ``attribute``.

    Loads the product, mention, review, mention_1 and mention_2 feature
    mappings with ``get_features``, chaining each to the previous one via
    ``existent_features`` (presumably to keep their indices disjoint --
    confirm against ``get_features``).

    Each document of ``Connection().jd.test_users`` is labelled with the
    index of the first 1 in ``user['profile'][attribute]``; unlabelled users
    are skipped.  The balance parameters are only printed; every labelled
    user is written.  Features come from:

    * counts of ``user['products']`` (product mapping);
    * ``user['mentions_0']`` (mention mapping);
    * counts of ``user['review']`` (review mapping);
    * ``user['mentions_1']`` and ``user['mentions_2']`` -- both effectively
      off, because each field is overwritten with ``{}`` right before it is
      iterated.

    The features are ordered by index and written as
    ``<label> <index>:<value> ...`` to
    ``iterate_label2trainset/<attribute>_test.data``; the user's ``_id``
    goes to ``iterate_label2trainset/<attribute>_test_uids.data``.  Users
    with no known feature still get a line (the label followed by a space).

    Args:
        attribute: key into ``user['profile']``; also used in the output
            file names.
    """
    feature_dir = base_dir + '/features/'
    product_features = get_features(
        feature_file=feature_dir + 'product.feature')
    mention_features = get_features(
        feature_file=feature_dir + 'mention.feature',
        existent_features=product_features)
    review_features = get_features(
        feature_file=feature_dir + 'review.feature',
        existent_features=mention_features)
    mention_features_1 = get_features(
        feature_file=feature_dir + 'mention_1.feature',
        existent_features=review_features)
    mention_features_2 = get_features(
        feature_file=feature_dir + 'mention_2.feature',
        existent_features=mention_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    # The double space matches what the former statement
    # ``print 'Balance params: ', balance_params`` emitted.
    print('Balance params:  %s' % (balance_params,))
    bar = progress_bar(collection.count())
    out_dir = RAW_DATA_DIR + 'iterate_label2trainset/'
    data_path = out_dir + '%s_test.data' % attribute
    uid_path = out_dir + '%s_test_uids.data' % attribute
    # ``with`` closes (and therefore flushes) both outputs on every exit
    # path; the previous bare ``open`` calls never closed them.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Unlabelled for this attribute.
                continue
            x = [(product_features[name], count)
                 for name, count in Counter(user['products']).items()
                 if name in product_features]
            x.extend((mention_features[name], value)
                     for name, value in user['mentions_0'].items()
                     if name in mention_features)
            x.extend((review_features[name], count)
                     for name, count in Counter(user['review']).items()
                     if name in review_features)
            # NOTE: this reset disables the mention_1 features; remove it
            # to make the following loop contribute again.
            user['mentions_1'] = {}
            for name, value in user['mentions_1'].items():
                name = name + '_1'
                if name in mention_features_1:
                    x.append((mention_features_1[name], value))
            # NOTE: same for mention_2.  Keys are looked up without a
            # suffix here, unlike mention_1 -- TODO confirm that is intended.
            user['mentions_2'] = {}
            for name, value in user['mentions_2'].items():
                if name in mention_features_2:
                    x.append((mention_features_2[name], value))
            x.sort(key=lambda d: d[0])
            str_x = ' '.join('%s:%f' % pair for pair in x)
            fout.write('%d %s\n' % (label, str_x))
            uid_output.write('%s\n' % (user['_id']))
            bar.draw(index + 1)