def refreshCoeffTableAndFollowing(self):
    '''
    Reload the coefficient table and the following lists when
    ``self.train_fname`` differs from the file that was loaded last.

    Reads:
        self.train_fname, self.prev_train_fname,
        self.coefficient_dir, self.following_dir
    Writes (only when a reload happens and fully succeeds):
        self.coeff_table, self.followings, self.prev_train_fname

    The coefficient file is ``<coefficient_dir>/<train_fname>_coeff.json``
    and must contain the keys 'pws' and 'pwfs'. The following file is
    ``<following_dir>/<train_fname>``; each line is
    ``user_id<TAB>id id id ...`` (ids separated by single spaces).

    Raises:
        IOError/OSError if either file cannot be opened, KeyError if the
        JSON lacks 'pws' or 'pwfs', ValueError if a following line does
        not split into exactly two tab-separated fields.
    '''
    train_fname = self.train_fname
    if train_fname == self.prev_train_fname:
        return  # cached data is still valid

    # Read the coefficient file; the context manager closes the handle.
    coeff_fname = os.path.join(self.coefficient_dir,
                               train_fname + '_coeff.json')
    with open(coeff_fname) as fin:
        tables = json.load(fin)

    # Read the following file into a fresh dict: user_id -> list of ids.
    f_fname = os.path.join(self.following_dir, train_fname)
    followings = {}
    with open(f_fname) as fin:
        for line in fin:
            user_id, f_list = line.rstrip('\n').split('\t')
            followings[user_id] = f_list.split(' ')

    coeff_table = CoeffTable(tables=(tables['pws'], tables['pwfs']))

    # Commit everything together, and only after every load succeeded, so
    # a failure above can never leave the cache marked as fresh while
    # coeff_table / followings still belong to the previous training file.
    self.coeff_table = coeff_table
    self.followings = followings
    self.prev_train_fname = train_fname
# NOTE(review): yaml is not used by the statements visible here; the import
# is kept because other parts of the file may depend on it.
import yaml
import json

from passWeightCoeff import CoeffTable

# Load one coefficient file, build a CoeffTable from its 'pws' and 'pwfs'
# entries, and call lookUp() on it.
fin_name = '../data/semi/coefficient/train0_coeff.json'

# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open(fin_name) as fin:
    tables = json.load(fin)

pw = tables['pws']
pwf = tables['pwfs']
# Single-argument parenthesized print: same output under Python 2, and it
# also parses under Python 3.
print('loaded')

coeff = CoeffTable(tables=(pw, pwf))
print('start cal')
coeff.lookUp()
class ReweightingTextFeatureConverter(TextFeatureConverter):
    '''
    Feature converter that rescales word weights for users that are not in
    the hard-label list, using precomputed p(w) and p(w|f) tables.

    The attributes that should be set by the caller:
        1. train_fname (instance variable)
        2. hard_label_fname (class variable)
        3. coefficient_dir (class variable)
        4. following_dir (class variable), the following data dir
    '''

    def __init__(self, getText=None, stopfile=None):
        '''
        Read in the hard-label user list so those users can be passed
        through untouched later. The coefficient and following files are
        read lazily by refreshCoeffTableAndFollowing().

        getText and stopfile are forwarded unchanged to the superclass
        constructor.
        '''
        super(ReweightingTextFeatureConverter, self).__init__(getText,
                                                              stopfile)

        # Configuration is copied from this exact class (not type(self)),
        # as in the original code.
        cls = ReweightingTextFeatureConverter
        self.hard_label_fname = cls.hard_label_fname
        self.coefficient_dir = cls.coefficient_dir
        self.following_dir = cls.following_dir

        # Training file whose tables are currently cached; None until the
        # first successful load.
        self.prev_train_fname = None

        # The first tab-separated field of every line is a hard-labeled
        # user id. The context manager closes the file handle.
        with open(self.hard_label_fname) as fin:
            self.hard_label_list = [line.split('\t')[0] for line in fin]

    def refreshCoeffTableAndFollowing(self):
        '''
        Reload self.coeff_table and self.followings when self.train_fname
        differs from the file that was loaded last; otherwise do nothing.

        The coefficient file is <coefficient_dir>/<train_fname>_coeff.json
        and must contain the keys 'pws' and 'pwfs'. The following file is
        <following_dir>/<train_fname>; each line is
        "user_id<TAB>id id id ..." (ids separated by single spaces).

        Raises IOError/OSError if a file cannot be opened, KeyError if the
        JSON lacks 'pws' or 'pwfs', and ValueError if a following line
        does not split into exactly two tab-separated fields.
        '''
        train_fname = self.train_fname
        if train_fname == self.prev_train_fname:
            return  # cached data is still valid

        # read an appropriate coefficient file
        coeff_fname = os.path.join(self.coefficient_dir,
                                   train_fname + '_coeff.json')
        with open(coeff_fname) as fin:
            tables = json.load(fin)

        # read an appropriate following file: user_id -> list of ids
        f_fname = os.path.join(self.following_dir, train_fname)
        followings = {}
        with open(f_fname) as fin:
            for line in fin:
                user_id, f_list = line.rstrip('\n').split('\t')
                followings[user_id] = f_list.split(' ')

        coeff_table = CoeffTable(tables=(tables['pws'], tables['pwfs']))

        # Commit only after every load succeeded, so a failure above never
        # leaves the cache marked fresh with data from the previous file.
        self.coeff_table = coeff_table
        self.followings = followings
        self.prev_train_fname = train_fname

    def additionalPass(self, words_arr, user_id, semi_label):
        '''
        Override superclass method to add another pass that adjusts the
        feature weights, based on the precomputed p(w) and p(w|f) values
        for the class given by semi_label.

        Users listed in the hard-label file are treated as clean data and
        their words_arr is returned unchanged. For every other user each
        (word, weight) pair becomes (word, c * weight) where

            c = min(p(w) / (1 - (1 - p(w)) * PROD_f (1 - p(w|f)) / (1 - p(w))), 2.0)

        The product runs over the user's followings that have an entry for
        the word. c is 1 when no following contributes (or the product is
        exactly 1). Words missing from the p(w) table are left unchanged.

        *The caller of this class is responsible for setting the proper
        training file name* (self.train_fname), so this method knows which
        files to read.

        words_arr:  iterable of (word, weight) pairs
        user_id:    key into the hard-label list and the following table
        semi_label: value convertible with int(); selects the tables

        Returns a list of (word, weight) tuples for reweighted users.
        Raises KeyError if user_id has no entry in the following file.
        '''
        if user_id in self.hard_label_list:
            return words_arr  # clean data: do nothing

        # refresh coeff table and following list if needed
        self.refreshCoeffTableAndFollowing()

        # pre-load the per-call lookups
        following = self.followings[user_id]
        label = int(semi_label)
        pw_table = self.coeff_table.pwTable(label)
        pwf_table = self.coeff_table.pwfTable(label)

        def _reweight(word_weight):
            word, weight = word_weight

            # numerator, i.e. p(w); unknown words keep their weight
            if word not in pw_table:
                return (word, weight)
            pw = pw_table[word]

            # Denominator, i.e. the noisy-or combination of p(w|f) over
            # every following that has an entry for this word.
            # NOTE(review): pw == 1 makes (1 - pw) zero and raises
            # ZeroDivisionError here; confirm the tables never contain it.
            mul_1_q = 1  # PI_i {1 - q_i}
            for f in following:
                try:
                    value = pwf_table[f][word]  # p(w|f)
                except KeyError:
                    continue  # this following has no entry for the word
                mul_1_q *= (1 - value) / (1 - pw)

            # final constant, capped at 2.0
            # NOTE(review): the denominator below can reach zero (e.g. a
            # single following with p(w|f) == 0); behaviour is unchanged
            # from the original and should be confirmed against the data.
            if mul_1_q == 1:
                c_wf = 1
            else:
                c_wf = min(pw / (1 - (1 - pw) * mul_1_q), 2.0)
            return (word, c_wf * weight)

        # A list comprehension returns a real list on Python 2 and 3 alike
        # (map() would hand back a lazy iterator on Python 3).
        return [_reweight(word_weight) for word_weight in words_arr]