def test(input_path, output_path, character_list_path, spell2character, datanum):
    """Evaluate pinyin->character conversion accuracy on the test dataset.

    Writes the reference sentences and their pinyin to testfile/, runs
    transfer() for each unit_lambda in the sweep, then compares the output
    against the reference character-by-character.

    Returns the accuracy (matched chars / total chars) of the last sweep value.
    """
    word2character = load_word2character(character_list_path)
    dataset = load_dataset(input_path)
    # Dump the full raw dataset (GBK-encoded) for reference.
    with open(input_path + '/testfile/test.txt', 'w') as fr:
        for x in dataset:
            print >> fr, x.encode('GBK')
    # Re-write test.txt with only the first 1000 sentences, keeping just the
    # characters that have a pinyin mapping, and write the parallel pinyin file.
    with open(input_path + '/testfile/test_pinyin.txt', 'w') as frpinyin:
        with open(input_path + '/testfile/test.txt', 'w') as fr:
            for i in range(0, 1000):
                x = dataset[i]
                pinyin = u''
                sent = u''
                for j in x:
                    character = word2character.setdefault(j, None)
                    if character:
                        pinyin = pinyin + character + ' '
                        sent = sent + j
                print >> frpinyin, pinyin.encode('GBK')
                print >> fr, sent.encode('GBK')
    # Sweep over unit_lambda values (currently just 30).
    for unit_lambda in range(30, 31):
        start_time = time.time()
        transfer(input_path=input_path + '/testfile/test_pinyin.txt',
                 output_path=output_path,
                 spell2character=spell2character,
                 datanum=datanum,
                 unit_lambda=unit_lambda)
        num_same = 0
        num_all = 0
        with open(input_path + '/testfile/test.txt', 'r') as fri:
            with open(output_path, 'r') as fro:
                while 1:
                    line1 = fri.readline()
                    line2 = fro.readline()
                    if not line1:
                        break
                    if not line2:
                        break
                    line1 = line1.decode('GBK')
                    line2 = line2.decode('GBK')
                    # BUG FIX: str.strip() returns a new string; the previous
                    # code discarded the result, so the trailing newline was
                    # included in both the length check and the comparison.
                    line1 = line1.strip('\n')
                    line2 = line2.strip('\n')
                    if len(line1) != len(line2):
                        # Lines out of sync: skip one reference line to realign.
                        line1 = fri.readline()
                    else:
                        num_all = num_all + len(line1)
                        for idx in range(len(line1)):
                            if line1[idx] == line2[idx]:
                                num_same = num_same + 1
        print('Unit_lambda:%02d' % unit_lambda)
        print('\tAccuracy rate: %02.3f' % (float(num_same) / float(num_all)))
        print("\tTook: %4.4fs" % (time.time() - start_time))
    # BUG FIX: plain `/` truncates to 0 for ints under Python 2; return the
    # real accuracy ratio instead.
    return float(num_same) / float(num_all)
def build(self):
    """Build per-enrollment base features and pickle them keyed by enrollment id.

    For every enrollment: a one-hot of its course, multi-hot vectors of the
    courses the same user enrolled in before/after this one, bucketed counts
    of those enrollments, and a few raw counts. Each feature group is
    transfer()-ed and joined into one comma-separated string.
    """
    print "start build BaseEnrollmentFeature..."
    fs = {}
    enrollment = Enrollment("../data/merge/enrollment.csv")
    course_statistic_time = CourseStatisticTimeInfo()
    course_statistic_time.load()
    course_time_sequence_info = CourseTimeSequenceInfo()
    course_time_sequence_info.load()
    for id in enrollment.ids:
        username, course_id = enrollment.enrollment_info.get(id)
        # One-hot of this enrollment's course.
        course_id_vec = [0] * COURSE_VEC_NUM
        course_id_vec[course_statistic_time.get_course_id(course_id)] = 1
        whole_site_before_course_ids_vec = [0] * COURSE_VEC_NUM
        whole_site_after_course_ids_vec = [0] * COURSE_VEC_NUM
        before_course_num, after_course_num = course_time_sequence_info.get_course_num_before_after(
            username, id)
        before_course_ids, after_course_ids = course_time_sequence_info.get_course_ids_before_after(
            username, id)
        # Multi-hot of the courses this user enrolled in before this enrollment.
        # NOTE: `k` is rebound from enrollment id to course id inside the loop.
        for k in before_course_ids:
            _username, k = enrollment.enrollment_info.get(k)
            whole_site_before_course_ids_vec[
                course_statistic_time.get_course_id(k)] = 1
        # Same, for enrollments made after this one.
        for k in after_course_ids:
            _username, k = enrollment.enrollment_info.get(k)
            whole_site_after_course_ids_vec[
                course_statistic_time.get_course_id(k)] = 1
        # Bucketed counts of before/after/total enrollments.
        whole_site_before_course_num_vec = get_vector(
            before_course_num, MAX_ENROLLMENT_VEC_NUM)
        whole_site_after_course_num_vec = get_vector(
            after_course_num, MAX_ENROLLMENT_VEC_NUM)
        enrollment_num = before_course_num + after_course_num
        enrollment_num_vec = get_vector(enrollment_num, MAX_ENROLLMENT_VEC_NUM)
        # Raw (un-bucketed) values; user_num = number of users in this course.
        user_num = len(enrollment.course_info.get(course_id, []))
        just_num_vec = [
            user_num, enrollment_num, before_course_num, after_course_num
        ]
        fv = [
            course_id_vec, whole_site_before_course_ids_vec,
            whole_site_after_course_ids_vec, whole_site_before_course_num_vec,
            whole_site_after_course_num_vec, enrollment_num_vec, just_num_vec
        ]
        # Each group becomes one comma-joined segment of transfer()-ed values.
        f = []
        for arr in fv:
            f.append(",".join(["%s" % transfer(k) for k in arr]))
        fs[id] = ",".join(["%s" % k for k in f])
    writepickle(BaseEnrollmentFeature.feature_filename, fs)
    print "build BaseEnrollmentFeature over"
def build(self): print "start build BaseEnrollmentFeature..." fs = {} enrollment = Enrollment("../data/merge/enrollment.csv") course_statistic_time = CourseStatisticTimeInfo() course_statistic_time.load() course_time_sequence_info = CourseTimeSequenceInfo() course_time_sequence_info.load() for id in enrollment.ids: username, course_id = enrollment.enrollment_info.get(id) course_id_vec = [0] * COURSE_VEC_NUM course_id_vec[course_statistic_time.get_course_id(course_id)] = 1 whole_site_before_course_ids_vec = [0] * COURSE_VEC_NUM whole_site_after_course_ids_vec = [0] * COURSE_VEC_NUM before_course_num, after_course_num = course_time_sequence_info.get_course_num_before_after(username,id) before_course_ids, after_course_ids = course_time_sequence_info.get_course_ids_before_after(username,id) for k in before_course_ids: _username, k = enrollment.enrollment_info.get(k) whole_site_before_course_ids_vec[course_statistic_time.get_course_id(k)] = 1 for k in after_course_ids: _username, k = enrollment.enrollment_info.get(k) whole_site_after_course_ids_vec[course_statistic_time.get_course_id(k)] = 1 whole_site_before_course_num_vec = get_vector(before_course_num, MAX_ENROLLMENT_VEC_NUM) whole_site_after_course_num_vec = get_vector(after_course_num, MAX_ENROLLMENT_VEC_NUM) enrollment_num = before_course_num + after_course_num enrollment_num_vec = get_vector(enrollment_num, MAX_ENROLLMENT_VEC_NUM) user_num = len(enrollment.course_info.get(course_id, [])) just_num_vec = [user_num, enrollment_num, before_course_num, after_course_num] fv = [course_id_vec, whole_site_before_course_ids_vec, whole_site_after_course_ids_vec, whole_site_before_course_num_vec, whole_site_after_course_num_vec, enrollment_num_vec, just_num_vec] f = [] for arr in fv: f.append(",".join(["%s" % transfer(k) for k in arr])) fs[id] = ",".join(["%s" % k for k in f]) writepickle(BaseEnrollmentFeature.feature_filename, fs) print "build BaseEnrollmentFeature over"
# NOTE(review): this `return` sits at module level — it appears to be the tail
# of the test() function that got separated from its body; as written it is a
# syntax error outside a function. TODO confirm intended placement.
return num_same / num_all


if __name__ == '__main__':
    # train
    # train(conf=conf)
    # load: restore the trained model tables from disk.
    spell2character = []
    words, spell2word, characters, spell2character = load_from_saved(
        path=conf.save_path)
    # Use conf defaults when no CLI paths are given, otherwise argv[1]/argv[2].
    if len(sys.argv) == 1:
        input_path = conf.input_path
        output_path = conf.output_path
    else:
        input_path = sys.argv[1]
        output_path = sys.argv[2]
    # Convert the pinyin input file to characters.
    transfer(input_path=input_path, output_path=output_path,
             spell2character=spell2character, datanum=conf.data_num,
             unit_lambda=conf.unit_lambda)
    # test
    # test(input_path=conf.test_path, output_path=conf.output_path,
    #      character_list_path=conf.character_list_path,
    #      spell2character=spell2character, datanum=conf.data_num)
    # print('Success!')
    os.system("pause")
def build(self):
    """Aggregate each enrollment's log events at day granularity and pickle
    the resulting feature strings keyed by enrollment id.

    Per enrollment, events are first grouped by calendar day (counting
    browser/server hits and per-event-type weights), then folded into
    sqrt-damped count vectors over event type, weekday, 2-hour slot, and
    course-time index.
    """
    print "start build DayLevelInfo..."
    coursetimeinfo = CourseTimeInfo()
    log = Log("../data/merge/log.csv")
    enrollment = Enrollment("../data/merge/enrollment.csv")
    obj = Obj()
    label = Label()
    userinfo = Userinfo()
    userinfo.load()
    module = Module()
    module.load()
    transfer_day = Transfer()
    transfer_day.load()
    ccc = 0  # progress counter
    fs = {}
    for id in enrollment.ids:
        ccc += 1
        if ccc % 5000 == 0:
            print ccc
        infos = log.enrollment_loginfo.get(id, [])
        username, course_id = enrollment.enrollment_info.get(id)
        #time,source,event,o
        #source: browser,server
        #event:access,discussion,nagivate,page_close,problem,video,wiki
        #category:video,vertical,static_tab,sequential,problem,peergrading,outlink,html,discussion,dictation,course_info,course,combinedopenended,chapter,about
        #time:2014-06-13T09:52:49
        info_by_day = {}
        for info in infos:
            # Skip malformed timestamps (no "T" separator).
            if info[0].find("T") < 0:
                continue
            day, timehms = info[0].split("T")
            #weight = module.get_weight(info[3])
            weight = 1
            if day not in info_by_day:
                info_by_day[day] = {}
                info_by_day[day]["event"] = {}
            # Tally browser vs server hits for this day.
            if info[1] == "browser":
                info_by_day[day]["browser"] = info_by_day[day].get("browser", 0) + weight
            else:
                info_by_day[day]["server"] = info_by_day[day].get("server", 0) + weight
            event_idx = get_event_idx(info[2])
            info_by_day[day]["event"][event_idx] = info_by_day[day]["event"].get(event_idx, 0) + weight
            weekday = week.get(day)
            info_by_day[day]["weekday"] = weekday
            # 2-hour slot (Python 2 integer division). NOTE: "hour" and "cidx"
            # are overwritten by each event, so the day keeps the last event's
            # values — presumably acceptable; confirm if it matters.
            hour = int(timehms[:2]) / 2
            info_by_day[day]["hour"] = hour
            cidx = obj.get_index(course_id, week.times(info[0]))
            info_by_day[day]["cidx"] = cidx
        # Fold the per-day aggregates into count vectors (sqrt-damped counts).
        day_event_count = [0] * EVENT_VEC_NUM
        day_weekday_count = [0] * WEEKDAY_VEC_NUM
        day_hour_count = [0] * HOUR_VEC_NUM
        day_cidx_count = [0] * CIDX_VEC_NUM
        _browser = 0
        _server = 0
        for (day, info) in info_by_day.items():
            for (k, v) in info["event"].items():
                day_event_count[k] = day_event_count[k] + math.sqrt(v)
            _browser = _browser + math.sqrt(info.get("browser", 0))
            _server = _server + math.sqrt(info.get("server", 0))
            day_weekday_count[info["weekday"]] = day_weekday_count[info["weekday"]] + 1
            day_hour_count[info["hour"]] = day_hour_count[info["hour"]] + 1
            day_cidx_count[info["cidx"]] = day_cidx_count[info["cidx"]] + 1
        # Assemble the flat feature vector: [browser, server, event..., weekday..., hour..., cidx...]
        f = [0] * (2 + EVENT_VEC_NUM + WEEKDAY_VEC_NUM + HOUR_VEC_NUM + CIDX_VEC_NUM)
        f[0] = transfer(_browser)
        f[1] = transfer(_server)
        fv_no_transfer = [day_event_count, day_weekday_count, day_hour_count, day_cidx_count]
        start = 2
        for vs in fv_no_transfer:
            for (i, v) in enumerate(vs):
                f[start+i] = transfer(v)
            start = start + len(vs)
        fs[id] = ",".join(["%.2f" % k for k in f])
    # Debug: final feature-vector length.
    print start
    modelFileSave = open('_feature/day.level.info.model', 'wb')
    pickle.dump(fs, modelFileSave)
    modelFileSave.close()
    print "build DayLevelInfo over!"
def build(self):
    """Build time-based features per enrollment and pickle them keyed by id.

    Features cover gaps between the enrollment's active days, site-wide
    activity before/after its last active day, "holding period" buckets, and
    bucketed day-count vectors.
    """
    print "start build BaseTimeFeature..."
    fs = {}
    enrollment = Enrollment("../data/merge/enrollment.csv")
    lastdayinfo = LastDayInfo()
    lastdayinfo.load_id_days()
    user_statistic_info = UserStatisticInfo()
    user_statistic_info.load()
    coursetimeinfo = CourseStatisticTimeInfo()
    coursetimeinfo.load()
    transfer_day = Transfer()
    transfer_day.load()
    for id in enrollment.ids:
        username, course_id = enrollment.enrollment_info.get(id)
        days = lastdayinfo.get_days(id)
        # day gap info: consecutive-day gaps, and halved gaps to the last day.
        gap_day_vec = [0] * MAX_GAP_DAY_VEC_NUM
        gap_lastday_vec = [0] * MAX_GAP_DAY_VEC_NUM
        for i in range(len(days) - 1):
            gap_day = TimeUtil.diff(days[i + 1], days[i])
            add_vector_value(gap_day_vec, gap_day)
            # Halved (Python 2 integer division), shifted by 1.
            gap_lastday = TimeUtil.diff(days[-1], days[i]) / 2 + 1
            add_vector_value(gap_lastday_vec, gap_lastday)
        alldays = user_statistic_info.get_unique_days(username)
        before_lastday_day_num = 0
        after_lastday_day_num = 0
        after_lastday_day_vec = [0] * MAX_GAP_DAY_VEC_NUM
        hold_day_in_enrollment = 0
        hold_day_in_site = 0
        hold_day_after_in_site = 0
        if len(days) > 0:
            lastday = days[-1]
            hold_day_in_enrollment = TimeUtil.diff(lastday, days[0])
            hold_day_in_site = TimeUtil.diff(lastday, sorted(alldays)[0])
            hold_day_after_in_site = TimeUtil.diff(
                sorted(alldays)[-1], lastday)
            # Count the user's site-wide active days before/after this
            # enrollment's last active day.
            for day in alldays:
                diff = TimeUtil.diff(day, lastday) / 2
                if diff > 0:
                    after_lastday_day_num = after_lastday_day_num + 1
                    add_vector_value(after_lastday_day_vec, diff)
                else:
                    before_lastday_day_num = before_lastday_day_num + 1
        # Days after lastday that fall inside the bucket range (last bucket
        # is treated as overflow and excluded).
        lastday_after_num_in_maxgap = sum(after_lastday_day_vec[:-1])
        hold_day_in_enrollment_vec = [0] * 12
        hold_day_in_site_vec = [0] * 12
        hold_day_after_in_site_vec = [0] * 12
        hold_day_in_enrollment_idx = get_gap_idx(hold_day_in_enrollment)
        hold_day_in_site_idx = get_gap_idx(hold_day_in_site)
        hold_day_after_in_site_idx = get_gap_idx(hold_day_after_in_site)
        hold_day_in_enrollment_vec[hold_day_in_enrollment_idx] = 1
        hold_day_in_site_vec[hold_day_in_site_idx] = 1
        hold_day_after_in_site_vec[hold_day_after_in_site_idx] = 1
        lastday_before_num_vec = get_vector(before_lastday_day_num, DAYS_VEC_NUM)
        lastday_after_num_vec = get_vector(after_lastday_day_num, DAYS_VEC_NUM)
        lastday_after_num_in_maxgap_vec = get_vector(
            lastday_after_num_in_maxgap, DAYS_VEC_NUM)
        day_len_vec = get_vector(len(days), DAYS_VEC_NUM)
        alldays_len_vec = get_vector(len(alldays), DAYS_VEC_NUM)
        day_len_half_vec = get_vector((len(days) + 1) / 2, DAYS_VEC_NUM / 2)
        alldays_len_half_vec = get_vector((len(alldays) + 1) / 2, DAYS_VEC_NUM / 2)
        # Calendar features of the last active day ("" when there is none).
        if len(days) > 0:
            transfer_vec = transfer_day.get_features(days[-1])
        else:
            transfer_vec = transfer_day.get_features("")
        just_num_vec = [
            before_lastday_day_num, after_lastday_day_num,
            lastday_after_num_in_maxgap, len(days), len(alldays)
        ]
        # NOTE(review): the three hold_day_* vectors appear TWICE in fv —
        # confirm the duplication is intentional.
        fv = [
            gap_day_vec, gap_lastday_vec, after_lastday_day_vec,
            hold_day_in_enrollment_vec, hold_day_in_site_vec,
            hold_day_after_in_site_vec, hold_day_in_enrollment_vec,
            hold_day_in_site_vec, hold_day_after_in_site_vec,
            lastday_before_num_vec, lastday_after_num_vec,
            lastday_after_num_in_maxgap_vec, day_len_vec, alldays_len_vec,
            day_len_half_vec, alldays_len_half_vec, transfer_vec, just_num_vec
        ]
        f = []
        for arr in fv:
            f.append(",".join(["%s" % transfer(k) for k in arr]))
        fs[id] = ",".join(["%s" % k for k in f])
    writepickle(BaseTimeFeature.feature_filename, fs)
    print "build BaseTimeFeature over"
def get_features(self, infos, course_id, isDebug = False):
    """Extract per-enrollment log features for one course.

    infos: log rows shaped like (timestamp "YYYY-MM-DDTHH:MM:SS", source,
    event, category, ..., obj-id) — rows without "T" in the timestamp are
    skipped. Returns a comma-joined string of "%.3f" transfer()-ed values.
    """
    week = Week()
    event_count = [0] * EVENT_VEC_NUM
    category_count = [0] * CATEGORY_VEC_NUM
    u_event_count = [0] * EVENT_VEC_NUM       # unique (event, obj) counts
    uday_event_count = [0] * EVENT_VEC_NUM    # unique (day, event, obj) counts
    event_timesum = [0] * EVENT_VEC_NUM       # time attributed to previous event
    event_sqrt_timesum = [0] * EVENT_VEC_NUM
    weekday_count = [0] * WEEKDAY_VEC_NUM
    hour_count = [0] * HOUR_VEC_NUM
    cidx_count = [0] * CIDX_VEC_NUM
    cidx_by_stat_count = [0] * CIDX_VEC_NUM
    month_count = [0] * MONTH_VEC_NUM
    next_public = [0] * CIDX_VEC_NUM
    dx_by_date_count = [0] * 30
    spend_time_count = [0] * 10
    sqrt_spend_time_count = [0] * 10
    browser = 0
    server = 0
    timesum = 0        # estimated time spent; gaps capped at 3 minutes
    sqrt_timesum = 0
    _p = 0             # previous event's epoch time
    cc = 0             # number of "session starts" (gap >= 3 minutes)
    _event_idx = -1    # previous event's type index
    u_event_dict = {}
    for info in infos:
        if info[0].find("T") < 0:
            continue
        p = week.times(info[0])
        # Gap to the previous event: use the real gap when under 3 minutes,
        # otherwise count 1 and treat it as a new session.
        _time = 1
        _sqrt_time = 1
        if _p > p - 60 * 3:
            _time = p - _p
            _sqrt_time = math.sqrt(p - _p)
        else:
            cc += 1
        timesum = timesum + _time
        sqrt_timesum = sqrt_timesum + _sqrt_time
        # Attribute the gap to the PREVIOUS event's type.
        if _event_idx != -1:
            event_timesum[_event_idx] = event_timesum[_event_idx] + _time/10.0
            event_sqrt_timesum[_event_idx] = event_sqrt_timesum[_event_idx] + _sqrt_time/10.0
        if isDebug:
            print timesum, p, _p, info[0]
        _p = p
        day, timehms = info[0].split("T")
        if info[1] == "browser":
            browser += 1
        else:
            server += 1
        # Half-month bucket: (month-1)*2 plus 0/1 for first/second half.
        year, month, d = day.split("-")
        month_idx = (int(month) - 1) * 2 + int(d)/16
        month_count[month_idx] = 1
        event_idx = get_event_idx(info[2])
        _event_idx = event_idx
        category_idx = self.obj.get_category_idx(info[3])
        category_count[category_idx] = category_count[category_idx] + 1
        # Unique-event counting keyed by event+obj (and day+event+obj).
        if info[2]+info[-1] not in u_event_dict:
            u_event_count[event_idx] = u_event_count[event_idx] + 1
        if day+info[2]+info[-1] not in u_event_dict:
            uday_event_count[event_idx] = uday_event_count[event_idx] + 1
        u_event_dict[info[2]+info[-1]] = 1
        event_count[event_idx] = event_count[event_idx] + 1
        weekday = week.get(day)
        weekday_count[weekday] = weekday_count[weekday] + 1
        hour = int(timehms[:2]) / 2   # 2-hour slot (Python 2 integer division)
        hour_count[hour] = hour_count[hour] + 1
        cidx = self.obj.get_index(course_id, week.times(info[0]))
        cidx_count[cidx] = cidx_count[cidx] + 1
        cidx_by_stat = self.coursetimeinfo.get_index(course_id, week.times(info[0]))
        cidx_by_stat_count[cidx_by_stat] = cidx_by_stat_count[cidx_by_stat] + 1
        dx_by_date = self.cdate.get_index(course_id, day)
        dx_by_date_count[dx_by_date] = dx_by_date_count[dx_by_date] + 1
    # NOTE(review): reconstructed as running AFTER the loop (using the last
    # event's time in _p); confirm against the original formatting.
    if _p > 1:
        next_public_diff = self.obj.get_index(course_id, _p)
    else:
        next_public_diff = CIDX_VEC_NUM
    if next_public_diff > CIDX_VEC_NUM - 1:
        next_public_diff = CIDX_VEC_NUM - 1
    next_public[next_public_diff] = 1
    # One-hot the bucketed time totals.
    time_idx = self.get_spend_time_idx(timesum)
    spend_time_count[time_idx] = 1
    time_idx = self.get_spend_time_idx(sqrt_timesum)
    sqrt_spend_time_count[time_idx] = 1
    buf = []
    objw = self.objweight.get_features(infos)
    # info_vec is computed but not emitted — kept as in the original.
    info_vec = [0] * INFO_VEC_NUM
    k = len(infos)
    if k > 3:
        k = int(math.sqrt(k - 3)) + 3
    if k > INFO_VEC_NUM-1:
        k = INFO_VEC_NUM-1
    info_vec[k] = 1
    fp = [cc, len(infos), browser, server, timesum/60.0, sqrt_timesum/60.0]
    # Smoothed browser/total ratio.
    buf.append("%.3f" % ((browser+3.1)/(float(len(infos))+6.5)))
    if isDebug:
        print fp
    # event_isclick_count stays all-zero (the assignment is commented out).
    event_isclick_count = [0] * (EVENT_VEC_NUM+1)
    k = sum([1 for i in event_count if i > 0])
    #event_isclick_count[k] = 1
    #print k
    fv = [event_count, weekday_count, hour_count, cidx_count, cidx_by_stat_count,
          month_count, spend_time_count, sqrt_spend_time_count, fp, next_public,
          event_sqrt_timesum, event_timesum, u_event_count, objw,
          event_isclick_count, uday_event_count, dx_by_date_count]
    fv_debug = ["event_count", "weekday_count", "hour_count", "cidx_count",
                "cidx_by_stat_count", "month_count", "spend_time_count",
                "sqrt_spend_time_count", "fp", "next_public",
                "event_sqrt_timesum", "event_timesum", "u_event_count", "objw",
                "event_isclick_count", "uday_event_count", "dx_by_date_count"]
    for j in range(len(fv)):
        vs = fv[j]
        if isDebug:
            print fv_debug[j], vs
        for (i, v) in enumerate(vs):
            buf.append("%.3f" % transfer(v))
    return ",".join(buf)
def get_features_no_courseid(self, infos, isDebug = False):
    """Course-independent log features: event/weekday/hour/month counts,
    browser vs server hits, and bucketed session-time estimates.

    infos: log rows shaped like (timestamp "YYYY-MM-DDTHH:MM:SS", source,
    event, ...) — rows without "T" in the timestamp are skipped.
    Returns a comma-joined string of "%.3f" transfer()-ed values.
    """
    week = Week()
    event_count = [0] * EVENT_VEC_NUM
    weekday_count = [0] * WEEKDAY_VEC_NUM
    hour_count = [0] * HOUR_VEC_NUM
    month_count = [0] * MONTH_VEC_NUM
    spend_time_count = [0] * 10
    sqrt_spend_time_count = [0] * 10
    browser = 0
    server = 0
    timesum = 0        # estimated time spent; gaps capped at 3 minutes
    sqrt_timesum = 0   # same, accumulating sqrt of each gap
    _p = 0             # previous event's epoch time
    cc = 0             # number of "session starts"
    for info in infos:
        if info[0].find("T") < 0:
            continue
        p = week.times(info[0])
        # Within-session gap (< 3 minutes, non-decreasing): add the real gap;
        # otherwise start a new session and add 1.
        if _p > p - 60 * 3 and _p <= p:
            timesum = timesum + p - _p
            sqrt_timesum = sqrt_timesum + math.sqrt(p - _p)
        else:
            cc += 1
            timesum = timesum + 1
            sqrt_timesum = sqrt_timesum + 1
        if isDebug:
            print timesum, p, _p, info[0]
        _p = p
        day, timehms = info[0].split("T")
        if info[1] == "browser":
            browser += 1
        else:
            server += 1
        # Half-month bucket: (month-1)*2 plus 0/1 for first/second half.
        year, month, d = day.split("-")
        month_idx = (int(month) - 1) * 2 + int(d)/16
        month_count[month_idx] = 1
        event_idx = get_event_idx(info[2])
        event_count[event_idx] = event_count[event_idx] + 1
        weekday = week.get(day)
        weekday_count[weekday] = weekday_count[weekday] + 1
        hour = int(timehms[:2]) / 2   # 2-hour slot (Python 2 integer division)
        hour_count[hour] = hour_count[hour] + 1
    # One-hot the bucketed time totals.
    time_idx = self.get_spend_time_idx(timesum)
    spend_time_count[time_idx] = 1
    time_idx = self.get_spend_time_idx(sqrt_timesum)
    sqrt_spend_time_count[time_idx] = 1
    # info_vec is computed but never emitted — kept as in the original.
    info_vec = [0] * INFO_VEC_NUM
    k = len(infos)
    if k > 5:
        k = int(math.sqrt(k - 5)) + 5
    if k > INFO_VEC_NUM-1:
        k = INFO_VEC_NUM-1
    info_vec[k] = 1
    buf = []
    fp = [cc, len(infos), browser, server, timesum/60.0, sqrt_timesum/60.0]
    # Smoothed browser/total ratio.
    buf.append("%.3f" % ((browser+3.1)/(float(len(infos))+6.5)))
    if isDebug:
        print fp
    fv = [event_count, weekday_count, hour_count, month_count,
          spend_time_count, sqrt_spend_time_count, fp]
    fv_debug = ["event_count", "weekday_count", "hour_count", "month_count",
                "spend_time_count", "sqrt_spend_time_count", "fp"]
    for j in range(len(fv)):
        vs = fv[j]
        if isDebug:
            print fv_debug[j], vs
        for (i, v) in enumerate(vs):
            buf.append("%.3f" % transfer(v))
    return ",".join(buf)
def get_features(id, IS_DEBUG=False):
    """Assemble the full feature line for one enrollment id.

    Combines course/last-day/day-level/site-wide/statistic/correlation feature
    groups and returns one CSV line:
    "label,id,course_id,day_count,stat_start_idx,+common,+lastday,+daylevel,
     +usersite,+f,+statistic,+dayindex,+cor\n".
    Relies on module-level objects (enrollment, label, userinfo, moreinfo, ...).
    """
    y = label.get(id)
    username, course_id = enrollment.enrollment_info.get(id)
    course_id_vec = [0] * COURSE_VEC_NUM
    course_id_vec[coursetimeinfo.get_course_id(course_id)] = 1
    days = lastdayinfo.get_days(id)
    is_last_vec = [0] * IS_LAST_VEC_NUM  #0 not last, 1-5, 6 more than 5
    is_pre_vec = [0] * IS_LAST_VEC_NUM  #0 not last, 1-5, 6 more than 5
    is_next_vec = [0] * IS_LAST_VEC_NUM  #0 not last, 1-5, 6 more than 5
    next_daynum_vec = [0] * IS_LAST_VEC_NUM
    gap_vec = [0] * IS_LAST_VEC_NUM
    if len(days) < 2:
        is_last_vec[0] = 1
        is_pre_vec[0] = 1
    else:
        last_day = days[-1]
        isCC = 0
        _diff = 100
        # Histogram of gaps between consecutive active days (capped).
        for i in range(len(days)-1):
            diff = TimeUtil.diff(days[i+1], days[i])
            if diff > (IS_LAST_VEC_NUM-1):
                diff = (IS_LAST_VEC_NUM-1)
            gap_vec[diff] = gap_vec[diff] + 1
        # Mark halved distances from each earlier day to the last day; track
        # the minimum in _diff.
        for day in days[:-1]:
            diff = TimeUtil.diff(last_day, day) / 2 + 1
            if diff < (IS_LAST_VEC_NUM-1):
                is_last_vec[diff] = 1
                isCC = isCC + 1
                if diff < _diff:
                    _diff = diff
        if isCC == 0:
            is_last_vec[IS_LAST_VEC_NUM-1] = 1
            _diff = IS_LAST_VEC_NUM-1
        is_pre_vec[_diff] = 1
    alldays = userinfo.get_days(username)
    daynum = 0
    whole_site_pre_vec = [0] * IS_LAST_VEC_NUM
    whole_site_post_vec = [0] * IS_LAST_VEC_NUM
    whole_site_pre_enrollment_vec = [0] * IS_LAST_VEC_NUM
    whole_site_post_enrollment_vec = [0] * IS_LAST_VEC_NUM
    whole_site_pre_enrollment_id_vec = [0] * COURSE_VEC_NUM
    whole_site_post_enrollment_id_vec = [0] * COURSE_VEC_NUM
    k1, k2 = moreinfo.get_enrollment_features(username, id)
    kids1, kids2 = moreinfo.get_enrollment_ids(username, id)
    # Multi-hot of courses from the user's other enrollments before/after
    # this one. NOTE: `k` is rebound from enrollment id to course id.
    for k in kids1:
        _username, k = enrollment.enrollment_info.get(k)
        whole_site_pre_enrollment_id_vec[coursetimeinfo.get_course_id(k)] = 1
    for k in kids2:
        _username, k = enrollment.enrollment_info.get(k)
        whole_site_post_enrollment_id_vec[coursetimeinfo.get_course_id(k)] = 1
    if k1 > IS_LAST_VEC_NUM - 1:
        k1 = IS_LAST_VEC_NUM - 1
    if k2 > IS_LAST_VEC_NUM - 1:
        k2 = IS_LAST_VEC_NUM - 1
    #print k1, k2, kids1, kids2
    whole_site_pre_enrollment_vec[k1] = 1
    whole_site_post_enrollment_vec[k2] = 1
    pre_num = 0
    post_num = 0
    lastday = ""
    hold_day_in_enrollment = 0
    if len(days) > 0:
        lastday = days[-1]
        hold_day_in_enrollment = TimeUtil.diff(lastday, days[0])
        # Site-wide days before/after this enrollment's last active day.
        for day in alldays:
            diff = TimeUtil.diff(day, days[-1]) / 2
            if diff > 0:
                post_num = post_num + 1
            else:
                pre_num = pre_num + 1
            if diff > 0 and diff < IS_LAST_VEC_NUM-1:
                is_next_vec[diff] = 1
                daynum += 1
                if daynum >= IS_LAST_VEC_NUM:
                    daynum = IS_LAST_VEC_NUM - 1
    else:
        print id, "X"
    hold_day_in_site = 0
    hold_day_after_in_site = 0
    if len(alldays) > 1 and len(lastday) > 1:
        hold_day_in_site = TimeUtil.diff(lastday, sorted(alldays)[0])
        hold_day_after_in_site = TimeUtil.diff(sorted(alldays)[-1], lastday)
    hold_day_in_enrollment_vec = [0] * 12
    hold_day_in_site_vec = [0] * 12
    hold_day_after_in_site_vec = [0] * 12
    hold_day_in_enrollment_idx = get_gap_idx(hold_day_in_enrollment)
    hold_day_in_site_idx = get_gap_idx(hold_day_in_site)
    hold_day_after_in_site_idx = get_gap_idx(hold_day_after_in_site)
    hold_day_in_enrollment_vec[hold_day_in_enrollment_idx] = 1
    hold_day_in_site_vec[hold_day_in_site_idx] = 1
    hold_day_after_in_site_vec[hold_day_after_in_site_idx] = 1
    #print username,hold_day_in_enrollment,hold_day_in_site,hold_day_after_in_site
    if daynum == 0:
        is_next_vec[IS_LAST_VEC_NUM-1] = 1
    if post_num > IS_LAST_VEC_NUM-1:
        post_num = IS_LAST_VEC_NUM-1
    if pre_num > IS_LAST_VEC_NUM-1:
        pre_num = IS_LAST_VEC_NUM-1
    if IS_DEBUG:
        print "post_num,", post_num, "pre_num", pre_num
    whole_site_pre_vec[pre_num] = 1
    whole_site_post_vec[post_num] = 1
    next_daynum_vec[daynum] = 1
    use_vec = userinfo.get_features(username, course_id)
    if len(days) > 0:
        transfer_vec = transfer_day.get_features(days[-1])
    else:
        transfer_vec = transfer_day.get_features("")
    # All active days across the user's enrollments (with duplicates).
    enr_ids = enrollment.user_enrollment_id.get(username, [])
    enrollment_num = len(enr_ids)
    non_unique_days = []
    for _id in enr_ids:
        _days = lastdayinfo.get_days(_id)
        non_unique_days = non_unique_days + _days
    # Pre-built feature strings from sibling feature builders.
    f_last_day = lastdayfeature.get_features(id)
    f_day_level = daylevel.get_features(id)
    f_common = alldayfeature.get_features(id)
    f_user_site = wholesitefeature.get_features(username)
    _lasthour = lastdayinfo.get_lasthour(id)
    f_cor, nodropdays = cor.get_features(id)
    f_statistic = statistic.get_features(lastday, course_id, days, alldays,
                                         non_unique_days, _lasthour, y, nodropdays)
    f_statistic_start_idx = statistic.get_start_idx(lastday, course_id)
    f_dayindex = lastdayinfo.get_day_idx_features(non_unique_days)
    # Bucketed day counts (full and halved resolutions).
    f_days = [0] * DAYS_VEC_NUM
    f_all_days = [0] * DAYS_VEC_NUM
    f_days_half = [0] * (DAYS_VEC_NUM/2)
    f_all_days_half = [0] * (DAYS_VEC_NUM/2)
    f_enrollment_num_vec = [0] * MAX_ENROLLMENT_VEC_NUM
    if enrollment_num > MAX_ENROLLMENT_VEC_NUM - 1:
        enrollment_num = MAX_ENROLLMENT_VEC_NUM - 1
    f_enrollment_num_vec[enrollment_num] = 1
    dy_num = len(days)
    if dy_num > DAYS_VEC_NUM-1:
        dy_num = DAYS_VEC_NUM-1
    f_days[dy_num] = 1
    if dy_num > 2:
        dy_num = min(dy_num, DAYS_VEC_NUM/2-1)
        f_days_half[dy_num] = 1
    dy_num = len(alldays)
    if dy_num > DAYS_VEC_NUM-1:
        dy_num = DAYS_VEC_NUM-1
    f_all_days[dy_num] = 1
    if dy_num > 2:
        dy_num = min(dy_num, DAYS_VEC_NUM/2-1)
        f_all_days_half[dy_num] = 1
    # Flat numeric feature vector; slots 0-4 are raw transfer()-ed scalars.
    f = [0] * 348
    f[0] = transfer(len(enrollment.course_info.get(course_id, [])))
    f[1] = transfer(math.sqrt(len(enrollment.course_info.get(course_id, []))))
    f[2] = transfer(enrollment_num)
    f[3] = transfer(len(days))
    f[4] = transfer(len(alldays))
    fv_no_transfer = [transfer_vec]
    fv_no_transfer_debug = ["transfer_vec"]
    enrollment_vec = get_enrollment_features(lastday, enrollment, username,
                                             lastdayinfo, id)
    start = 5
    # Copy groups that are already transfer()-ed as-is.
    for j in range(len(fv_no_transfer)):
        vs = fv_no_transfer[j]
        if IS_DEBUG:
            print fv_no_transfer_debug[j], vs
        for (i, v) in enumerate(vs):
            f[start+i] = v
        start = start + len(vs)
    #fv = [course_id_vec,is_last_vec, use_vec, is_next_vec,next_daynum_vec,is_pre_vec,f_days,f_all_days,whole_site_pre_vec,whole_site_post_vec,gap_vec, f_enrollment_num_vec, whole_site_pre_enrollment_vec, whole_site_post_enrollment_vec, whole_site_pre_enrollment_id_vec, whole_site_post_enrollment_id_vec, hold_day_in_enrollment_vec,hold_day_in_site_vec,hold_day_after_in_site_vec]
    fv = [course_id_vec, is_last_vec, use_vec, is_next_vec, next_daynum_vec,
          is_pre_vec, f_days, f_all_days, whole_site_pre_vec,
          whole_site_post_vec, gap_vec, f_enrollment_num_vec,
          whole_site_pre_enrollment_vec, whole_site_post_enrollment_vec,
          whole_site_pre_enrollment_id_vec, whole_site_post_enrollment_id_vec,
          enrollment_vec, f_days_half, f_all_days_half]
    # NOTE(review): fv_debug still lists the three hold_day_* vectors that were
    # removed from fv (22 labels vs 19 groups), so debug labels after index 16
    # are misaligned — debug output only, but worth fixing.
    fv_debug = ["course_id_vec", "is_last_vec", "use_vec", "is_next_vec",
                "next_daynum_vec", "is_pre_vec", "f_days", "f_all_days",
                "whole_site_pre_vec", "whole_site_post_vec", "gap_vec",
                "f_enrollment_num_vec", "whole_site_pre_enrollment_vec",
                "whole_site_post_enrollment_vec",
                "whole_site_pre_enrollment_id_vec",
                "whole_site_post_enrollment_id_vec",
                "hold_day_in_enrollment_vec", "hold_day_in_site_vec",
                "hold_day_after_in_site_vec", "enrollment_vec",
                "f_days_half", "f_all_days_half"]
    for j in range(len(fv)):
        vs = fv[j]
        if IS_DEBUG:
            print fv_debug[j], vs
        for (i, v) in enumerate(vs):
            f[start+i] = transfer(v)
        start = start + len(vs)
    if IS_DEBUG:
        print start
    f = ",".join(["%.2f" % k for k in f])
    fs = "%s,%s,%s,%d,%d," \
         "+%s,+%s,+%s,+%s,+%s," \
         "+%s,+%s,+%s\n" % (y, id, course_id, len(days), f_statistic_start_idx,
                            f_common, f_last_day, f_day_level, f_user_site, f,
                            f_statistic, f_dayindex, f_cor)
    #fs = "%s,%s,%s,%d,%s\n" % (y, id, course_id, len(days), f_last_day )
    return fs
#!/usr/bin/python from SynonymsTree import * import cPickle as pickle from transfer import * def transfer(l, coeff=0.5): print l print " -> Transfert possible ? " + str(transferConditions(arbre, l, coeff)) print l print "--------------------------------------------------------" myfile = open("phages.data", "rb") arbre = pickle.load(myfile) l1 = ["integrase", "integrase", "recombinase", "recombinase spe0", "hypothetical protein", "recombinase spe2", "integrase spe0"] l2 = ["integrase", "integrase", "recombinase", "recombinase spe0", "test2", "phage repressor protein", "hypothetical protein"] l3 = ["integrase", "integrase", "recombinase spe0", "phage repressor protein", "phage repressor protein"] l4 = ["integrase", "integrase", "recombinase", "hypothetical protein"] print arbre.displayTree() transfer(l1) transfer(l2, 0.8) transfer(l3) transfer(l4) myfile.close()
load_data.preprocess_fragment(cfrag)) train_step = train_step_factory(extractor, content_targets, style_targets, style_content_loss, content_layers, style_layers, style_weight=1000, content_weight=1 ) epochs = 7 np_data = transfer(transfer_chunk, train_step, epochs = epochs ) data_diffs = [] for j in range(epochs -1): npd = np_data[j+1] - np_data[j] data_diffs.append(np.abs(npd).sum().sum()) print(f"Absolute difference per epoch: {data_diffs}") transfer_frag = Fragment( generated_song, i, np_data = load_data.postprocess_fragment( tf.Variable(np_data[-1])) )
print "******************Executing in the same node",site,"***************" print "Parent cost",totalCostAtParent transfer(cur,db,site,None,cpuCost,diskCost,tables,totalCostAtParent) listOfQueries.append({"siteId":site, "cpuUtilization":cpuCost, "diskUtilization":diskCost, "parentId":None, "tables":tables, "time":int(totalCostAtParent/10)+15, "executionTime":int(totalCostAtParent/10)+1}) else: print "******************ALL ARE 100",site,"***************" listOfWaititngQueries.append({"tables":tables,"site":site,"isServed":0}) queueSize+= 1 ''' if cpuUsage+cpuCost < 100 and diskUsage+diskCost < 100: print "******************Executing in the same node",site,"***************" print "Parent cost",totalCostAtParent transfer(cur,db,site,None,cpuCost,diskCost,tables,totalCostAtParent) listOfQueries.append({"siteId":site, "cpuUtilization":cpuCost, "diskUtilization":diskCost, "parentId":None, "tables":tables, "time":int(totalCostAtParent/10)+15, "executionTime":int(totalCostAtParent/10)+1}) else: print "******************ALL ARE 100",site,"***************" listOfWaititngQueries.append({"tables":tables,"site":site,"isServed":0}) queueSize+= 1 f.close() q.close() c.close() m.close() cur.close() db.close()
childNode.append(node) flag = 0 ###################### when a child node with the data is found ############### for child in childNode: if flag == 0: cpuCost = (calculateCPUcost(cur,child,tables))/1000 diskCost =0# (calculateDiskCost(cur,child,tables))/100 waitingTime = calculateWaitingTime(cur,child) totalCostAtChild = cpuCost + diskCost + waitingTime cpuUsage = getCPUUtilization(cur,child) diskUsage = getDiskUtilization(cur,child) if totalCostAtParent > totalCostAtChild and cpuUsage+cpuCost < 100 and diskUsage+diskCost < 100: print "Parent cost",totalCostAtParent,"Child cost",totalCostAtChild print "************* executing at child",child,"***************" transfer(cur,db,child,site,cpuCost,diskCost,tables,totalCostAtChild) listOfQueries.append({"siteId":child, "cpuUtilization":cpuCost, "diskUtilization":diskCost, "parentId":site, "tables":tables, "time":int(totalCostAtChild/10)+15, "executionTime":int(totalCostAtChild/10)+1}) flag = 1 ###################### search the neighbouring node to execute the query ############### neighbourNode = findNeighbourNode(cur,site) for neighbour in neighbourNode: if flag == 0: cpuCost = (calculateCPUcost(cur,neighbour,tables))/1000 diskCost = 0# (calculateDiskCost(cur,neighbour,tables))/100 waitingTime = calculateWaitingTime(cur,neighbour) cpuUsage = getCPUUtilization(cur,neighbour) diskUsage = getDiskUtilization(cur,neighbour)
def main(argv): loginType = 0 accounts = [] transactions = [] deletions = [] withdrawals = {} if (not len(argv) == 2): return # Infinite loop for receiving input. while True: command = input() # Disallow any actions until login. if (not loginType and command != 'login'): print "Error, please login before entering other commands." # Disallow multiple logins. elif (loginType and command == 'login'): print "You are already logged in." # Execute login elif command == 'login': loginType = login(accounts, argv[0]) # Execute logout elif command == 'logout': loginType = logout(transactions, argv[1]) # Disallow create in retail mode elif loginType == 2 and command == 'create': print "Error, please use agent mode to create accounts." # Execute create elif command == 'create': create(accounts, deletions, transactions) # Disallow delete in retail mode elif loginType == 2 and command == 'delete': print "Error, please use agent mode to delete accounts." # Execute delete elif command == 'delete': delete(accounts, deletions, transactions) # Execute deposit elif command == 'deposit': deposit(loginType, accounts, deletions, transactions) # Execute withdraw elif command == 'withdraw': withdraw(loginType, accounts, deletions, transactions, withdrawals) # Execute transfer elif command == 'transfer': transfer(loginType, accounts, deletions, transactions) else: print "Error, command not recognized." if loginType == 0: withdrawals = {} deletions = []
def build(self):
    """Build per-enrollment time-based features and pickle them.

    For every enrollment id, derives activity-day gap histograms, counts of
    site-wide activity before/after the enrollment's last active day, and
    "holding period" bucket vectors, then serialises one comma-joined feature
    string per id into BaseTimeFeature.feature_filename.
    """
    print "start build BaseTimeFeature..."
    fs = {}  # enrollment id -> comma-joined feature string
    enrollment = Enrollment("../data/merge/enrollment.csv")
    lastdayinfo = LastDayInfo()
    lastdayinfo.load_id_days()
    user_statistic_info = UserStatisticInfo()
    user_statistic_info.load()
    coursetimeinfo = CourseStatisticTimeInfo()
    coursetimeinfo.load()
    transfer_day = Transfer()
    transfer_day.load()
    # NOTE: `id` shadows the builtin; kept as-is to preserve the code.
    for id in enrollment.ids:
        username, course_id = enrollment.enrollment_info.get(id)
        # Active days for this enrollment, presumably sorted ascending by
        # date -- TODO confirm in LastDayInfo.get_days.
        days = lastdayinfo.get_days(id)
        # day gap info
        gap_day_vec = [0] * MAX_GAP_DAY_VEC_NUM
        gap_lastday_vec = [0] * MAX_GAP_DAY_VEC_NUM
        for i in range(len(days)-1):
            # Gap between consecutive active days.
            gap_day = TimeUtil.diff(days[i+1], days[i])
            add_vector_value(gap_day_vec, gap_day)
            # Distance to the last active day, bucketed into 2-day units
            # (Python 2 integer division) and shifted by one.
            gap_lastday = TimeUtil.diff(days[-1], days[i]) / 2 + 1
            add_vector_value(gap_lastday_vec, gap_lastday)
        # All unique active days for this user across the whole site.
        alldays = user_statistic_info.get_unique_days(username)
        before_lastday_day_num = 0
        after_lastday_day_num = 0
        after_lastday_day_vec = [0] * MAX_GAP_DAY_VEC_NUM
        hold_day_in_enrollment = 0
        hold_day_in_site = 0
        hold_day_after_in_site = 0
        if len(days) > 0:
            lastday = days[-1]
            # Span of activity inside this enrollment.
            hold_day_in_enrollment = TimeUtil.diff(lastday, days[0])
            # Span from the user's first site-wide day to this enrollment's
            # last day, and from that last day to the user's final site day.
            hold_day_in_site = TimeUtil.diff(lastday, sorted(alldays)[0])
            hold_day_after_in_site = TimeUtil.diff(sorted(alldays)[-1],lastday)
            # Count site-wide activity relative to this enrollment's last
            # active day (2-day buckets for the "after" histogram).
            for day in alldays:
                diff = TimeUtil.diff(day, lastday) / 2
                if diff > 0:
                    after_lastday_day_num = after_lastday_day_num + 1
                    add_vector_value(after_lastday_day_vec, diff)
                else:
                    before_lastday_day_num = before_lastday_day_num + 1
        # Days after lastday that fell inside the bucket range (the final
        # bucket is the overflow slot and is excluded here).
        lastday_after_num_in_maxgap = sum(after_lastday_day_vec[:-1])
        # One-hot bucket vectors for the three holding-period lengths.
        hold_day_in_enrollment_vec = [0] * 12
        hold_day_in_site_vec = [0] * 12
        hold_day_after_in_site_vec = [0] * 12
        hold_day_in_enrollment_idx = get_gap_idx(hold_day_in_enrollment)
        hold_day_in_site_idx = get_gap_idx(hold_day_in_site)
        hold_day_after_in_site_idx = get_gap_idx(hold_day_after_in_site)
        hold_day_in_enrollment_vec[hold_day_in_enrollment_idx] = 1
        hold_day_in_site_vec[hold_day_in_site_idx] = 1
        hold_day_after_in_site_vec[hold_day_after_in_site_idx] = 1
        # Bucketed count vectors; the *_half variants use half-resolution
        # buckets ((n+1)/2 is Python 2 integer division).
        lastday_before_num_vec = get_vector(before_lastday_day_num, DAYS_VEC_NUM)
        lastday_after_num_vec = get_vector(after_lastday_day_num, DAYS_VEC_NUM)
        lastday_after_num_in_maxgap_vec = get_vector(lastday_after_num_in_maxgap, DAYS_VEC_NUM)
        day_len_vec = get_vector(len(days), DAYS_VEC_NUM)
        alldays_len_vec = get_vector(len(alldays), DAYS_VEC_NUM)
        day_len_half_vec = get_vector((len(days)+1)/2, DAYS_VEC_NUM/2)
        alldays_len_half_vec = get_vector((len(alldays)+1)/2, DAYS_VEC_NUM/2)
        # Calendar features for the last active day ("" when no activity).
        if len(days) > 0:
            transfer_vec = transfer_day.get_features(days[-1])
        else:
            transfer_vec = transfer_day.get_features("")
        # Raw (unbucketed) numeric features appended verbatim.
        just_num_vec = [before_lastday_day_num, after_lastday_day_num,
                        lastday_after_num_in_maxgap, len(days), len(alldays)]
        # NOTE(review): hold_day_in_enrollment_vec, hold_day_in_site_vec and
        # hold_day_after_in_site_vec appear TWICE in fv, duplicating those
        # feature columns -- possibly unintentional; confirm before changing,
        # since the feature layout is consumed downstream.
        fv = [gap_day_vec, gap_lastday_vec, after_lastday_day_vec,
              hold_day_in_enrollment_vec, hold_day_in_site_vec,
              hold_day_after_in_site_vec, hold_day_in_enrollment_vec,
              hold_day_in_site_vec, hold_day_after_in_site_vec,
              lastday_before_num_vec, lastday_after_num_vec,
              lastday_after_num_in_maxgap_vec, day_len_vec, alldays_len_vec,
              day_len_half_vec, alldays_len_half_vec, transfer_vec,
              just_num_vec]
        f = []
        for arr in fv:
            # transfer() here is the module-level scalar formatter used by
            # the other feature builders, not Transfer/transfer_day.
            f.append(",".join(["%s" % transfer(k) for k in arr]))
        fs[id] = ",".join(["%s" % k for k in f])
    writepickle(BaseTimeFeature.feature_filename, fs)
    print "build BaseTimeFeature over"
def build(self):
    """Build per-enrollment day-level aggregate features and pickle them.

    For every enrollment id, groups log events by calendar day, then
    accumulates sqrt-damped event counts plus weekday / 2-hour-slot /
    course-index histograms over those days, and serialises one
    comma-joined feature string per id into DayLevelFeature.feature_filename.
    """
    print "start build DayLevelFeature..."
    log = LogInfo("../data/merge/log.csv")
    enrollment = Enrollment("../data/merge/enrollment.csv")
    obj = Obj()
    ccc = 0  # progress counter only
    fs = {}  # enrollment id -> comma-joined feature string
    feature_num = 0
    # NOTE: `id` shadows the builtin; kept as-is to preserve the code.
    for id in enrollment.ids:
        ccc += 1
        if ccc % 5000 == 0:
            print ccc  # progress heartbeat
        infos = log.enrollment_loginfo.get(id, [])
        username, course_id = enrollment.enrollment_info.get(id)
        # day -> per-day accumulators: "event" counter dict, browser/server
        # counts, plus dayofweek / hour-slot / course-index (last event wins
        # for the latter three).
        info_by_day = {}
        for info in infos:
            # info[0] is expected to be an ISO-like "date T time" stamp;
            # skip malformed entries without the separator.
            if info[0].find("T") < 0:
                continue
            day, timehms = info[0].split("T")
            weight = 1  # flat weight per event; left as a tunable constant
            if day not in info_by_day:
                info_by_day[day] = {}
                info_by_day[day]["event"] = {}
            # info[1] is the event source: "browser" vs server-side.
            if info[1] == "browser":
                info_by_day[day]["browser"] = info_by_day[day].get(
                    "browser", 0) + weight
            else:
                info_by_day[day]["server"] = info_by_day[day].get(
                    "server", 0) + weight
            # info[2] is the event type, mapped to a fixed index.
            event_idx = get_event_idx(info[2])
            info_by_day[day][
                "event"][event_idx] = info_by_day[day]["event"].get(
                    event_idx, 0) + weight
            info_by_day[day]["dayofweek"] = TimeUtil.getDayWeek(day)
            # 2-hour slot (Python 2 integer division), so 0..11.
            hour = int(timehms[:2]) / 2
            info_by_day[day]["hour"] = hour
            # Course period index at this timestamp -- presumably which
            # phase of the course the event falls in; TODO confirm in Obj.
            cidx = obj.get_index(course_id, TimeUtil.timestamp(info[0]))
            info_by_day[day]["cidx"] = cidx
        day_event_count = [0] * EVENT_VEC_NUM
        day_weekday_count = [0] * WEEKDAY_VEC_NUM
        day_hour_count = [0] * HOUR_VEC_NUM
        day_cidx_count = [0] * CIDX_VEC_NUM
        _browser = 0
        _server = 0
        # Aggregate across days; sqrt damps heavy single-day activity.
        for (day, info) in info_by_day.items():
            for (k, v) in info["event"].items():
                day_event_count[k] = day_event_count[k] + math.sqrt(v)
            _browser = _browser + math.sqrt(info.get("browser", 0))
            _server = _server + math.sqrt(info.get("server", 0))
            day_weekday_count[info["dayofweek"]] = day_weekday_count[
                info["dayofweek"]] + 1
            day_hour_count[info["hour"]] = day_hour_count[info["hour"]] + 1
            day_cidx_count[info["cidx"]] = day_cidx_count[info["cidx"]] + 1
        just_num_vec = [_browser, _server]
        fv = [
            day_event_count, day_weekday_count, day_hour_count,
            day_cidx_count, just_num_vec
        ]
        f = []
        # feature_num is recomputed per id; only the final iteration's value
        # survives for the closing print (all ids produce the same width).
        feature_num = 0
        for arr in fv:
            feature_num += len(arr)
            # transfer() formats each scalar for the feature string.
            arr = ["%s" % transfer(k) for k in arr]
            f.append(",".join(arr))
        fs[id] = ",".join(f)
    writepickle(DayLevelFeature.feature_filename, fs)
    print "build DayLevelFeature over!", feature_num
from index import * from transfer import * if __name__ == '__main__': originImagePath = '../images/filter/stars_200.jpg' targetImagePath = '../images/filter/golden_gate_bridge.jpg' start = time.clock() filterData, filterShape = loadImageData(targetImagePath, True, False) originData, originShape = loadImageData(originImagePath, True, False) filterRange = rangeMat(filterData) originRange = rangeMat(originData) filterData = normalization(filterData) originData = normalization(originData) originU, originV = loadUV('../tem/stars_200/mb7') targetU, targetV = loadUV('../tem/bridge_400/b7') transfer(originU, originV, originData, originRange, targetU, targetV, filterData, filterRange) print time.clock() - start start = time.clock() originU, originV = loadUV('../tem/stars_200/ma7') targetU, targetV = loadUV('../tem/bridge_400/a7') transfer(originU, originV, originData, originRange, targetU, targetV, filterData, filterRange) print time.clock() - start