def main():
    """Build HCP-level heatmap and chord-diagram tables from WeChat/web logs.

    Output tables:
      hcp_heatmap  - doctorid, month_id, tag_name, tag_count
      hcp_chordmap - doctorid, point_one, point_two, count
    Returns 1 on completion (including the empty-data error path).
    """
    hcp_tags = nn.Dataframefactory("hcp_tag", iotype=iotype)
    similar_words = nn.Dataframefactory("similar", iotype=iotype)
    mapping = mappingCbind(similar_words, hcp_tags)
    createDictStop()

    wechat = nn.Dataframefactory("wechat", iotype=iotype)
    web = nn.Dataframefactory("web", iotype=iotype)
    # Combine WeChat and web behaviour into one dataframe.
    behav = dataPrepare(wechat, web)

    if behav.shape[0] == 0:
        # Nothing usable came out of the raw logs; complain and stop.
        print("ERROR!!!")
        print("NO VALID DATA IS PREPARED! PLEASE CHECK THE RAW DATA.")
        print()
        return (1)

    doctor_ids = list(set(behav["doctorid"]))
    print("Finished Data preparation")

    titles = behav['content_title'].dropna().drop_duplicates().to_frame()
    labelled_titles = titleLabeling(titles, mapping)
    labelled = behav.merge(labelled_titles,
                           left_on='content_title',
                           right_on='content_title')
    labelled["month_id"] = labelled["start_date"].apply(getMonthId)
    # Keep only rows that received at least one lv2 tag.
    valid = labelled[labelled.lv2_tag.str.len() != 0]

    # Accumulate per-doctor heatmap and chord-diagram stats.
    heat_parts = []
    chord_parts = []
    print("Begin calculating")
    for doc_id in doctor_ids:
        per_doc = valid[valid["doctorid"] == doc_id]
        if per_doc.shape[0] != 0:
            heat_parts.append(statsBySegment(per_doc, doc_id))
            chord = chordStatsBySeg(per_doc, doc_id)
            if chord.shape[0] != 0:
                chord_parts.append(chord)

    heat_out = pd.concat(heat_parts, ignore_index=True)
    chord_out = pd.concat(chord_parts, ignore_index=True)
    print("Finished calculating")

    # hcp_heatmap: doctorid, month_id, tag_name, tag_count
    nn.write_table(heat_out, 'hcp_heatmap', iotype=iotype)
    # hcp_chordmap: doctorid, point_one, point_two, count
    nn.write_table(chord_out, 'hcp_chordmap', iotype=iotype)
    return (1)
def load():
    """Load all data and derived artifacts for the Content Rec Sys.

    Side effects: populates the module-level globals declared below
    (content libraries, tfidf matrices, most-viewed ranking, unique users).
    Returns a completion message string.
    """
    print("Designed for Novo4PE: Content Rec Sys")
    print("------------------------------------------------------")
    print("Step 1: loading necessary data")
    behavior_raw = nn.Dataframefactory('wechat', iotype=iotype)
    global content_lib_pat
    content_lib_pat = nn.Dataframefactory('pat_articles', iotype=iotype)
    global content_lib_hcp
    content_lib_hcp = nn.Dataframefactory('hcp_articles', iotype=iotype)
    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Creating dictionary")
    create_dict_stop()
    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Processing Content_Lib and Behavior Data")
    global content_lib
    content_lib = content_lib_processing(content_lib_pat, content_lib_hcp)
    global behavior_all_indexed
    behavior_all_indexed = interaction_processing(behavior_raw)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Caculating Cotent Attribute")
    global corpus_list
    corpus_list = corpus_process(content_lib)
    gen_tfidf_matrix(corpus_list)
    global pat_tfidf_matrix, hcp_tfidf_matrix, pat_content_title, hcp_content_title
    pat_tfidf_matrix, hcp_tfidf_matrix, pat_content_title, hcp_content_title = \
        split_tfidf_matrix(content_lib)
    global most_viewed
    most_viewed = get_ranked_content_title(behavior_all_indexed, content_lib)
    print("Step 4: Done")
    print("------------------------------------------------------")
    print("Step 5: Generating Personal Rec List")
    global uniq_usr
    uniq_usr = behavior_raw.hcp_openid_u_2.unique()
    # Per-user recommendation generation happens elsewhere; this step only
    # records the unique user ids.
    print("Step 5: Done")
    print("------------------------------------------------------")
    print("LOAD COMPLETE")
    return ('ALGORITHM LOADING COMPLETE')
def __init__(self):
    """Load every raw WeCall table and build the derived behaviour views."""
    print("Preparing WeCall DataManager")
    self.__tokenizer = Seg()
    self.__tagger = TagSys()
    # Raw source tables.
    self.__behavior_raw = nn.Dataframefactory("behavior", iotype=iotype)
    self.__web_raw = nn.Dataframefactory("web", iotype=iotype)
    self.__wecall_content = nn.Dataframefactory("wecall_article", iotype=iotype)
    self.__wecall_doctor = nn.Dataframefactory("wecall_doctor", iotype=iotype)
    self.__wecall_sales = nn.Dataframefactory("wecall_sales", iotype=iotype)
    self.__novo_hcp_info = nn.Dataframefactory("novo_hcp", iotype=iotype)
    self.__novo_market_lv = nn.Dataframefactory("novo_hcp_market", iotype=iotype)
    self.__wecall_article_brand = nn.Dataframefactory("wecall_brand", iotype=iotype)
    self.__wecall_content_detail = nn.Dataframefactory("wecall_detail", iotype=iotype)
    # Derived views. NOTE(review): the builders presumably read attributes
    # set by earlier lines, so keep this order — confirm before reordering.
    self.__cooked_behavior, self.__wecall_bev = self.__behavior_data_process()
    self.__hcp_behavior = self.__make_hcp_behavior()
    self.__behavior_content_tag = self.__make_behavior_content_tag()
    self.__wecall_content_tag = self.__make_wecall_content_tag()
    self.__wecall_hcp_market_mapping = self.__make_wecall_market_mapping()
    self.__wecall_behavior = self.__make_wecall_behavior()
    self.__hcp_title_market_info = self.__make_hcp_market_title()
    self.__wecall_content_brand = self.__make_wecall_content_brand()
    self.__wecall_content_url = self.__make_wecall_content_url()
    print("WeCall DataManager's ready to serve")
def main():
    """Build patient-level heatmap and chord-diagram tables from WeChat logs.

    Output tables:
      pat_heatmap  - openID, month_id, tag_name, tag_count
      pat_chordmap - openID, point_one, point_two, count
    Returns 1 on completion (including the empty-data error path).
    """
    tag = nn.Dataframefactory("pat_tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)
    mapping = mappingCbind(simi, tag)
    createDictStop()
    wechat = nn.Dataframefactory("wechat", iotype=iotype)
    # Only WeChat data feeds the patient pipeline (no web source here).
    cbindBehavData = dataPrepare(wechat)
    # Added: empty-data guard, matching the sibling HCP pipeline; without
    # it pd.concat([]) below raises ValueError on an empty prepare result.
    if cbindBehavData.shape[0] == 0:
        print("ERROR!!!")
        print("NO VALID DATA IS PREPARED! PLEASE CHECK THE RAW DATA.")
        return (1)
    patList = list(set(cbindBehavData["hcp_openid_u_2"]))
    print("Finished Data preparation")
    contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates(
    ).to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    print(contentLabeled)
    allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                left_on='content_title',
                                                right_on='content_title')
    allBehavDataLabelled["month_id"] = allBehavDataLabelled[
        "start_date"].apply(getMonthId)
    # Keep only rows that received at least one lv2 tag.
    validBehavDataLabelled = allBehavDataLabelled[
        allBehavDataLabelled.lv2_tag.str.len() != 0]
    # Accumulate per-patient heatmap and chord-diagram stats.
    heatMapPart = []
    chordMapPart = []
    print("Begin calculating")
    for openID in patList:
        segBehavData = validBehavDataLabelled[
            validBehavDataLabelled["hcp_openid_u_2"] == openID]
        if segBehavData.shape[0] != 0:
            heatMapPart.append(statsBySegment(segBehavData, openID))
            segChordData = chordStatsBySeg(segBehavData, openID)
            if segChordData.shape[0] != 0:
                chordMapPart.append(segChordData)
    heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
    chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
    print("Finished calculating")
    # pat_heatmap: openID, month_id, tag_name, tag_count
    nn.write_table(heatMapOutput, 'pat_heatmap', iotype=iotype)
    # pat_chordmap: openID, point_one, point_two, count
    nn.write_table(chordMapOutput, 'pat_chordmap', iotype=iotype)
    return (1)
def __init__(self, option="hcp"):
    """Load tag and similar-word tables for HCP or patient labelling.

    option: "hcp" (doctor tags) or "pat" (patient tags).
    Raises ValueError for any other option.
    """
    self.similar_words = nn.Dataframefactory('similar', iotype=iotype)
    if option == "hcp":
        self.tag = nn.Dataframefactory('hcp_tag', iotype=iotype)
        self.lb = "医生标签"
    elif option == "pat":
        self.tag = nn.Dataframefactory('pat_tag', iotype=iotype)
        self.lb = "病人标签"
    else:
        # Fixed: was `print(...)` followed by a bare `raise Exception`.
        # ValueError is a subclass of Exception, so existing callers that
        # catch Exception still work, and the reason travels with the error.
        raise ValueError('option is only for pat or hcp')
    self.mapping = self.__create_mapping()
    self.word_tag = self.__get_word_tag()
def main():
    """Recompute the TF-IDF matrix and vectorizer and save them to the resource path."""
    # Pre-defined artifact names and output location.
    corpus_raw = nn.Dataframefactory('labeledContent', sep='|', iotype='db',
                                     con=nnenv.getItem('mysql_url'))
    vector = "vectorizer.joblib"
    matrix = "tfidf.npy"
    outpath = nnenv.getResourcePath()
    # Load dictionary and stopwords used during segmentation.
    createDictStop()
    # Title and content combined into one corpus column.
    corpus = combineTitleAndContent(corpus_raw)
    # Persist the content_id -> row-index mapping alongside the matrix.
    content_id_mapping = corpus[["content_id"]]
    content_id_mapping.index.name = 'index'
    content_id_mapping.to_csv(outpath + nnenv.getItem('content_id_mapping'))
    # Segment each document into the form the vectorizer expects
    # (segment() uses the jieba dictionary loaded above).
    corpus["corpus"] = corpus["all"].apply(segment)
    # Create the tfidf matrix and its fitted vectorizer.
    tfidfMatrix, vectorizer = createTfidfMatrix(corpus)
    # Save the essential artifacts.
    with open(outpath + vector, 'wb') as f:
        joblib.dump(vectorizer, f)
    np.save(outpath + matrix, tfidfMatrix)
    # Fixed: the old message spelled "{}" as two adjacent string literals
    # ('{' '' '}'), which was easy to misread; the output is unchanged.
    print("new tfidf_matrix and vectorizer have been saved into {}".format(outpath))
def createDictStop():
    """Load the external jieba dictionary and the stop-word list.

    Side effects: populates the module-level `stopWord` list and registers
    every dictionary word with jieba at a very high frequency so the
    segmenter keeps them whole.
    """
    print("Loading Dictionary and Stopwords")
    global stopWord
    # Fixed: separator was '/r/n' (forward slashes, a literal 4-char
    # string); every sibling loader in this codebase splits these
    # one-word-per-line files on the CRLF escape '\r\n'.
    dic = nn.Dataframefactory('mappingword', sep='\r\n', iotype=iotype)
    word = dic.word.tolist()
    stopWord = nn.Dataframefactory('stopword', sep='\r\n', iotype=iotype)
    stopWord = stopWord.word.tolist()
    stopWord.append(" ")
    # Allow Chinese, ASCII alphanumerics, Greek alpha/beta and a few
    # symbols to stay inside a single token.
    jieba.re_han_default = re.compile(
        r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/β/α/-]+)', re.UNICODE)
    frequnecy = 100000000000000000000000
    # Register every dictionary word with jieba.
    for words in word:
        jieba.add_word(words, freq=frequnecy)
    print("Finished Dic Loading")
def __load_dict(self):
    """Widen jieba's token regex and register the external word list."""
    # Accept Chinese, ASCII alphanumerics, curly/straight quotes and
    # Greek alpha/beta inside a single token.
    jieba.re_han_default = re.compile(
        r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/”/“/"/β/α/-]+)', re.UNICODE)
    dictionary = nn.Dataframefactory('mappingword', sep='\r\n', iotype=iotype)
    for term in dictionary.word:
        jieba.add_word(term, self.freq)
def loading_everything():
    """Populate the module-level globals used by the labelling service."""
    global tag, similar, mapping, clf, tfidf_matrix, labeled_corpus, title_list, content_id_mapping
    createDictStop()
    # Tag dictionary plus similar-word table form the title->tag mapping.
    tag = nn.Dataframefactory('tag', iotype='fs')
    similar = nn.Dataframefactory('similar', iotype='fs')
    mapping = mappingCbind(similar, tag)
    # Pre-trained vectorizer and the TF-IDF matrix it produced.
    clf = nn.Joblibfactory(nnenv.getItem('vectorizer'))
    tfidf_matrix = nn.Numpyarrayfactory(nnenv.getItem('tfidf'))
    labeled_corpus = nn.Dataframefactory('labeledContent', sep='|',
                                         iotype='fs',
                                         con=nnenv.getItem('mysql_url'))
    title_list = labeled_corpus.title.tolist()
    content_id_mapping = nn.Dataframefactory('content_id_mapping', iotype='fs')
def input_check(table):
    """Fetch `table` as a dataframe, treating an empty table as an error.

    NOTE(review): on ANY failure (load error or empty table) this prints a
    message and implicitly returns None — callers must cope with None.
    """
    try:
        df = nn.Dataframefactory(table, iotype=iotype)
        if df.empty:
            raise ValueError("table {} is empty".format(table))
    except Exception as e:
        print(
            "There was an error in your input table, Please double check your data source\n:{}"
            .format(e))
    else:
        return df
def createDictStop():
    """Load external dictionary and stop words.

    Side effects: sets the module-level `stopWord` list and adds every
    dictionary word to jieba with a very high frequency.
    """
    print("Loading Dictionary and Stopwords")
    global stopWord
    dic = nn.Dataframefactory('mappingword', sep='\r\n', iotype=iotype)
    stopWord = nn.Dataframefactory('stopword', sep='\r\n', iotype=iotype)
    # Fixed: `word = dic.word.tolist()` appeared twice back to back;
    # the duplicate has been removed.
    word = dic.word.tolist()
    stopWord = stopWord.word.tolist()
    # NOTE(review): unlike the sibling loaders this variant does not append
    # a lone space to stopWord — confirm whether that is intentional.
    jieba.re_han_default = re.compile(
        r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/β/α/-]+)', re.UNICODE)
    frequnecy = 1000000000000000000000
    # Register every dictionary word with jieba (freq passed positionally).
    for words in word:
        jieba.add_word(words, frequnecy)
    print("Dictionary and StopWord have been loaded")
def main():
    """Aggregate call-centre questions per patient and classify them.

    Output table pat_call_center_stats: patient_id, customer_question,
    question_category, question_sub_category, product_type.
    Returns 1 on completion.
    """
    raw = nn.Dataframefactory("pat_call_center", iotype=iotype)
    mapping = nn.Dataframefactory("pat_call_mapping", iotype=iotype)
    print("Begin aggregating patient questions")
    questions = sepQuestions(raw)
    print("Patient questions prepared")
    print("Begin calculating")
    # Left join keeps questions even when no category mapping exists.
    merged = pd.merge(questions,
                      mapping,
                      how="left",
                      left_on="customer_question",
                      right_on="question")
    wanted_cols = [
        "patient_id", "customer_question", "question_category",
        "question_sub_category", "product_type"
    ]
    output = merged[wanted_cols]
    print("Finished calculating")
    nn.write_table(output, 'pat_call_center_stats', iotype=iotype)
    return (1)
def create_dict_stop():
    """Load external dictionary and stop words.

    Side effects: sets the module-level `stopWord` list and registers every
    dictionary word with jieba at a very high frequency.
    """
    print("Loading Dictionary and Stopwords")
    global stopWord
    # Removed: an unused `dic_path` local and the commented-out
    # pd.read_csv calls it supported.
    dic = nn.Dataframefactory("mappingword", sep='\r\n', iotype=iotype)
    word = dic.word.tolist()
    # NOTE(review): the stopword table is read WITHOUT sep='\r\n' here,
    # unlike every sibling loader — confirm the source format before
    # changing it.
    stopWord = nn.Dataframefactory("stopword", iotype=iotype)
    stopWord = stopWord.word.tolist()
    stopWord.append(" ")
    # Allow Chinese, ASCII alphanumerics, quotes and Greek alpha/beta
    # inside a single token.
    jieba.re_han_default = re.compile(
        r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/”/“/"/β/α/-]+)', re.UNICODE)
    frequnecy = 100000000000000000000000
    # Register every dictionary word with jieba.
    for words in word:
        jieba.add_word(words, freq=frequnecy)
    print("Finished Dic Loading")
def __load_stopword(self):
    """Read the stop-word table and cache it on the instance as a plain list."""
    table = nn.Dataframefactory("stopword", sep='\r\n', iotype=iotype)
    self.stopwords = table.word.tolist()
import nndw
import nnenv

# Default dataframe handle produced by the warehouse factory.
df = nndw.Dataframefactory()
def main():
    """Build segment-level heatmap/chord tables and segment mapping outputs.

    Writes four tables: heatmap, chordmap, segmentmapping and
    customerCodeSegId. Returns 1 on completion.
    """
    tag = nn.Dataframefactory("tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)
    mapping = mappingCbind(simi, tag)
    createDictStop()
    novoHcpAgg = nn.Dataframefactory("hcp_ability_detailing", iotype=iotype)
    doctorList = list(set(novoHcpAgg["customer_code"]))
    wechat = nn.Dataframefactory("wechat", iotype=iotype)
    web = nn.Dataframefactory("web", iotype=iotype)
    # Combine WeChat and web behaviour into a single dataframe.
    cbindBehavData = dataPrepare(wechat, web, doctorList)
    print("Finished Data preparation")
    contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates(
    ).to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                left_on='content_title',
                                                right_on='content_title')
    allBehavDataLabelled["month_id"] = allBehavDataLabelled[
        "start_date"].apply(getMonthId)
    # Keep only rows that received at least one lv2 tag.
    validBehavDataLabelled = allBehavDataLabelled[
        allBehavDataLabelled.lv2_tag.str.len() != 0]
    # Segment mapping table (written out at the end).
    allCbindDf = cbindAllConditions(novoHcpAgg)
    print("Created segment mapping file")
    # Level-2 tag stats; keep the 15 most frequent labels overall.
    allLv2Stats = statsByLevel(validBehavDataLabelled, "lv2_tag")
    topLv2LabelsDf = getTopNLabels(allLv2Stats, 15)
    print("Found top 15 tags of all doctors")
    # Per segment: compute heatmap data and chord-diagram data.
    heatMapPart = []
    chordMapPart = []
    print("Begin calculating")
    for segId in allCbindDf["segment_id"]:
        segDocList = getSegDoctorList(allCbindDf, novoHcpAgg, segId)
        if len(segDocList) != 0:
            segBehavData = validBehavDataLabelled[
                validBehavDataLabelled["doctorid"].isin(segDocList)]
            if segBehavData.shape[0] != 0:
                heatMapPart.append(
                    statsBySegment(segBehavData, segId, topLv2LabelsDf))
                segChordData = chordStatsBySeg(segBehavData, segId)
                if segChordData.shape[0] != 0:
                    chordMapPart.append(segChordData)
    heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
    chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
    print("Finished calculating")
    # Attach segment ids back onto customer codes via a left join on the
    # segment-defining attributes.
    objNovoHcpAgg = novoHcpAgg.astype("object")
    mergeSegID = pd.merge(
        objNovoHcpAgg,
        allCbindDf,
        how="left",
        left_on=["detailing_path_id", "level", "academic_title", "department"],
        right_on=["detailing_path", "hcp_segment", "title", "department"])
    customerCodeSegId = mergeSegID[["customer_code", "segment_id"]]
    nn.write_table(heatMapOutput, 'heatmap', iotype=iotype)
    nn.write_table(chordMapOutput, 'chordmap', iotype=iotype)
    nn.write_table(allCbindDf, 'segmentmapping', iotype=iotype)
    nn.write_table(customerCodeSegId, 'customerCodeSegId', iotype=iotype)
    return (1)
def main():
    """End-to-end HCP analytics pipeline for Novo4PE-Pilot.

    Produces five tables: channel preference, content interest, interest
    keywords, reading history and a personal recommendation list (with
    article URLs). Returns 1 on completion.
    """
    print("Designed for Novo4PE-Pilot")
    print("------------------------------------------------------")
    print("Step 1: loading necessary data")
    tag = nn.Dataframefactory('tag', iotype=iotype)
    similar = nn.Dataframefactory('similar', iotype=iotype)
    mapping = mappingCbind(similar, tag)
    wechat = nn.Dataframefactory('wechat', iotype=iotype)
    # Only WeCall-module articles are eligible for recommendation.
    wecall = wechat[wechat.module_2.isin(["WeCall 2.0", "WeCall 1.0"])]
    wecall_content = wecall.content_title.unique()
    web = nn.Dataframefactory('web', iotype=iotype)
    novo_hcp = nn.Dataframefactory('novo_hcp', iotype=iotype)
    novo_market = nn.Dataframefactory('novo_hcp_market', iotype=iotype)
    article_url = nn.Dataframefactory('article_url', iotype=iotype)
    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Creating dictionary")
    createDictStop()
    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Processing Raw Data")
    wechatFilterd, webFilterd, validWechatLog, validWebLog, contentPrefData, LogData = dataPrepare(
        wechat, web)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Caculating Channel Preference")
    output1 = channelPref(wechatFilterd, webFilterd)
    print("Step 4: Done")
    cotentTitle = contentPrefData['content_title'].dropna(
    ).drop_duplicates().to_frame()
    contentLabeled = titleLabeling(cotentTitle, mapping)
    contentNew = contentPrefData.merge(
        contentLabeled, left_on='content_title', right_on='content_title')
    print("------------------------------------------------------")
    print("Step 5: Caculating HCP Content Preference and Interest Point")
    output2 = pd.DataFrame()
    output3 = pd.DataFrame()
    # Per doctor: content interest stats plus keyword counts.
    for dc_id in contentNew.doctorid.unique():
        contentInsteret, otherTags, lb, labelMap = calContInst(
            contentNew, dc_id)
        keywordCnt = calContKeyWord(
            contentNew, dc_id, lb, otherTags, labelMap, mapping)
        output2 = output2.append(contentInsteret)
        output3 = output3.append(keywordCnt)
    output2.reset_index(drop=True, inplace=True)
    output3.reset_index(drop=True, inplace=True)
    print("Step 5: Done")
    print("------------------------------------------------------")
    print("Step 6: Caculating HCP reading History")
    webHistWithoutToken = webHistWithoutTokens(validWebLog)
    wechathistWithoutToken = wechatHistWithoutTokens(validWechatLog)
    output4 = readingHist(webHistWithoutToken, wechathistWithoutToken,
                          contentLabeled)
    print("Step 6: Done")
    content_uq = get_content_uniq(LogData)
    hcp_reading_history = get_hcp_reading_history(LogData)
    doctorid_uq = get_uniq_doctorid(LogData)
    hcp_lb_uq = get_hcp_label_uniq(mapping)
    content_lb = contentLabeled[["content_title", "HCP标签"]]
    content_lb_pop = content_lb.merge(
        content_uq[["content_id", "content_title", "popularity"]],
        on="content_title")
    # Expand the HCP-tag column into one indicator column per known label.
    content_lb_pop[hcp_lb_uq] = content_lb_pop["HCP标签"].apply(
        create_var, args=(hcp_lb_uq,))
    hcp_tech_class, hcp_info_pro = get_hcp_tech_class(
        novo_hcp, novo_market, doctorid_uq, hcp_reading_history)
    hcp_class_mapping = get_hcp_class_mapping(
        hcp_info_pro, hcp_tech_class, doctorid_uq)
    # Removed: an unused `content_pop` intermediate that was never read.
    # Restrict candidates to WeCall articles only.
    content_lb_pop = content_lb_pop[
        content_lb_pop.content_title.isin(wecall_content)]
    print("------------------------------------------------------")
    print("Step 7: Generating HCP Personal Recommendation List")
    o2 = output2.copy()
    o2["Ratio"] = o2.Ratio.apply(p2f)
    output5 = pd.DataFrame()
    for doc_id in doctorid_uq:
        # One 5-row frame per doctor: the three methods side by side.
        test = pd.DataFrame(np.nan, index=range(0, 5), columns=[
            "doctorid", "rec_cnt", "method1", "method2", "method3"])
        # Method 1: most popular unread articles.
        test["method1"] = content_lb_pop[~content_lb_pop["content_title"].isin(hcp_reading_history.get(doc_id))] \
            .sort_values('popularity', ascending=False) \
            .head(5) \
            .content_title \
            .reset_index(drop=True)
        # Method 2: most popular unread articles matching the doctor's
        # interest keywords; IndexError means no interest data exists.
        try:
            inst_list = get_most_interest_keyword(o2, doc_id)
            personal_rec = content_lb_pop[content_lb_pop[inst_list].any(1)]
            test["method2"] = personal_rec[~personal_rec["content_title"].isin(hcp_reading_history.get(doc_id))] \
                .sort_values('popularity', ascending=False) \
                .head(5) \
                .content_title \
                .reset_index(drop=True)
        except IndexError:
            test["method2"] = np.nan
        # Method 3: articles popular within the doctor's class/segment.
        try:
            hcp_class_content = get_hcp_class(
                hcp_tech_class, hcp_class_mapping, doc_id, content_lb_pop)
        except IndexError:
            hcp_class_content = pd.DataFrame(
                columns=["content_title", "popularity"])
        test["method3"] = hcp_class_content[~hcp_class_content["content_title"].isin(hcp_reading_history.get(doc_id))] \
            .sort_values('popularity', ascending=False) \
            .head(5) \
            .content_title \
            .reset_index(drop=True)
        test["doctorid"] = doc_id
        test["rec_cnt"] = test.index + 1
        output5 = output5.append(test)
    output5 = output5.reset_index(drop=True)
    # Attach article URLs for each method column.
    url = article_url[["title", "url"]]
    output5_1 = output5.merge(url, left_on=["method1"], right_on="title",
                              how="left")
    output5_1 = output5_1.merge(url, left_on=["method2"], right_on="title",
                                how="left", suffixes=("_1", "_2"))
    output5_1 = output5_1.merge(url, left_on=["method3"], right_on="title",
                                how="left", suffixes=("_1", "_2"))
    # Fixed: the drop list read ["title_1", "title_1", "title"], leaving
    # title_2 behind. The column selection below discarded it anyway, so
    # the final output is unchanged, but the intent was clearly to drop
    # all three joined title columns.
    col_drop = ["title_1", "title_2", "title"]
    output5_1.drop(columns=col_drop, inplace=True)
    col_left = [
        "doctorid", "rec_cnt", "method1", "method2", "method3",
        "url_1", "url_2", "url"
    ]
    output5_1 = output5_1[col_left]
    output5_1.rename(columns={"url": "url_3"}, inplace=True)
    print("Step 7: Done")
    print("------------------------------------------------------")
    print("ALL COMPLETE")
    nn.write_table(output1, 'hcp_channel_preference', iotype=iotype)
    nn.write_table(output2, 'hcp_content_interest', iotype=iotype)
    nn.write_table(output3, 'hcp_content_interest_keyword', iotype=iotype)
    nn.write_table(output4, 'hcp_reading_history', iotype=iotype)
    nn.write_table(output5_1, 'hcp_recommendation', iotype=iotype)
    return (1)
def main():
    """WeCall mini-program article recommendation pipeline.

    Combines three strategies (guess-what-you-like, colleague behaviour,
    popularity), keeps the best method per doctor/article, reshapes the
    result into a wide per-doctor table with titles and URLs, and writes
    it to MySQL. Returns 1 on completion.
    """
    print("Designed for WeCall Mini Progarm Article RecSys")
    print("------------------------------------------------------")
    print("Step 1: Loading necessary data")
    dm = DataManager()
    status = nn.Dataframefactory("wecall_doctor",
                                 iotype=iotype)[["code", "status"]]
    status.columns = ["doctorid", "status"]
    # Pull every prepared view off the DataManager.
    wecall_behavior = dm.get_wecall_behavior      # behaviour log
    wecall_content = dm.get_wecall_content_tag    # article library
    hcp_market_title = dm.get_hcp_market_title    # market/region info
    doc_list = dm.get_wecall_doctor               # doctors to serve
    all_behavior_data = dm.get_all_behavior
    behavior_content_tag = dm.get_behavior_content_tag
    wecall_content_tag = dm.get_wecall_content_tag
    hcp_brand = dm.get_hcp_market_mapping
    content_brand = dm.get_wecall_article_brand
    content_brand = content_brand.rename(columns={'document_id': 'content_id'})
    wecall_url = dm.get_wecall_url
    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Processing necessary data")
    # De-duplicated article library, useful columns only.
    all_content = wecall_content[['content_id', 'content_title']]
    all_content = all_content.drop_duplicates(['content_id'])
    # Normalise content ids that carry a dot-suffix.
    wecall_behavior[
        'content_id'] = dm.get_wecall_behavior.content_id.str.split(
            pat=".", n=1, expand=True)[0]
    # Popularity: summed interaction strength per (doctor, article) and
    # per article overall.
    behavior_popularity_df = wecall_behavior.groupby(
        ['doctorid', 'content_id'])['strength'].sum() \
        .sort_values(ascending=False).reset_index()
    item_popularity_df = wecall_behavior.groupby([
        'content_id'
    ])['strength'].sum().sort_values(ascending=False).reset_index()
    # Join popularity back onto the article library.
    all_content_merge = all_content.merge(item_popularity_df,
                                          how="left",
                                          on="content_id")
    all_content_merge = all_content_merge.fillna(0)
    all_behavior_merge = all_content.merge(behavior_popularity_df,
                                           how="left",
                                           on="content_id")
    all_behavior_merge = all_behavior_merge.fillna(0)
    all_behavior_merge = all_behavior_merge.merge(content_brand,
                                                  how='left',
                                                  on="content_id")
    popularity_df = all_content_merge.groupby('content_id')['strength'].sum(
    ).sort_values(ascending=False).reset_index()
    popularity_df = popularity_df.merge(content_brand,
                                        how='left',
                                        on="content_id")
    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Generating Recommendation by popularity")
    # Method 3 — popularity ranking.
    start1 = time.time()
    popularity_model = PopularityRecommender(all_behavior_merge,
                                             popularity_df)
    doctor_list = DataFrame(doc_list)
    doctor_list = doctor_list.rename(columns={0: 'doctor_id'})
    doctor_list = doctor_list.merge(hcp_brand, how='left', on='doctor_id')
    method3_final = popularity_model.deliver_final(doctor_list)
    end1 = time.time()
    running_time1 = end1 - start1
    print('time cost : %.5f sec' % running_time1)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Generating Recommendation by Colleague")
    # Method 2 — colleague recommendation.
    start2 = time.time()
    cl_rc = ColleagueRcsys(wecall_behavior, hcp_market_title, hcp_brand,
                           content_brand, doc_list, all_content)
    method2_final = cl_rc.delivery_final()
    method2_final = method2_final[[
        'doctor_id', 'content_id', 'strength', 'method'
    ]]
    end2 = time.time()
    running_time2 = end2 - start2
    print('time cost : %.5f sec' % running_time2)
    print("Step 4: Done")
    print("------------------------------------------------------")
    print("Step 5: Generating Recommendation by Guess what you Like")
    # Method 1 — guess what you like.
    start3 = time.time()
    method1_final = recommand(all_behavior_data, behavior_content_tag,
                              wecall_content_tag, hcp_brand, content_brand,
                              doc_list)
    method1_final = method1_final[[
        'doctor_id', 'content_id', 'strength', 'method'
    ]]
    end3 = time.time()
    running_time3 = end3 - start3
    print('time cost : %.5f sec' % running_time3)
    print("Step 5: Done")
    print("------------------------------------------------------")
    print("Step 6: Generating Final Recommendation Result")
    start4 = time.time()
    # Keep the best (lowest-numbered) method per doctor/article, cap at 5
    # rows per method, then add the popularity picks.
    final_recommend = method1_final.append(method2_final)
    final_recommend_new = final_recommend.groupby(
        ['doctor_id', 'content_id'])['method'].min().reset_index()
    final_recommend_new = final_recommend_new.groupby(
        ['doctor_id', 'method']).head(5)
    final_recommend_2 = final_recommend_new.append(method3_final)
    popularity_df_update = popularity_df[['content_id',
                                          'strength']].drop_duplicates()
    final_output = final_recommend_2.groupby(
        ['doctor_id', 'content_id'])['method'].min().reset_index()
    final_output = final_output.merge(popularity_df_update,
                                      how='left',
                                      on='content_id')
    # Reshape each method's rows into a wide frame, then stitch them.
    df1 = final_output[final_output['method'] == 1]
    df2 = final_output[final_output['method'] == 2]
    df3 = final_output[final_output['method'] == 3]
    t = Transformer(df1)
    x1 = t.getDataframe()
    if df2.empty:
        x2 = DataFrame(
            [],
            columns=['doctor_id', 'xn1', 'content_id', 'method', 'strength'])
    else:
        t.setDataframe(df2)
        x2 = t.getDataframe()
    t.setDataframe(df3)
    x3 = t.getDataframe()
    xf = x3.merge(x2, on=['doctor_id', 'xn1'],
                  how='left').merge(x1, on=['doctor_id', 'xn1'], how='left')
    xf['createtime'] = time.strftime("%m/%d/%Y %H:%M", time.localtime())
    xf = xf.rename(
        columns={
            'doctor_id': 'doctorid',
            'xn1': 'rec_cnt',
            'content_id_x': 'm1_id',
            'content_id': 'm2_id',
            'content_id_y': 'm3_id'
        })
    # Attach titles and URLs for each method column.
    xf1 = xf.merge(wecall_url, how='left', left_on='m1_id',
                   right_on='content_id').rename(columns={
                       'content_title': 'method1',
                       'url': 'url_1'
                   })
    xf2 = xf1.merge(wecall_url, how='left', left_on='m2_id',
                    right_on='content_id').rename(columns={
                        'content_title': 'method2',
                        'url': 'url_2'
                    })
    xf3 = xf2.merge(wecall_url, how='left', left_on='m3_id',
                    right_on='content_id').rename(columns={
                        'content_title': 'method3',
                        'url': 'url_3'
                    })
    xf_final = xf3[[
        'doctorid', 'rec_cnt', 'method1', 'm1_id', 'method2', 'm2_id',
        'method3', 'm3_id', 'url_1', 'url_2', 'url_3', 'createtime'
    ]]
    # Swap ASCII commas for full-width commas inside the titles —
    # presumably to protect a comma-separated downstream consumer; confirm.
    xf_final["method1"] = xf_final["method1"].str.replace(',', ',',
                                                          regex=False)
    xf_final["method2"] = xf_final["method2"].str.replace(',', ',',
                                                          regex=False)
    xf_final["method3"] = xf_final["method3"].str.replace(',', ',',
                                                          regex=False)
    end4 = time.time()
    running_time4 = end4 - start4
    print('time cost : %.5f sec' % running_time4)
    print("Step 6: Done")
    xf_final['Index'] = xf_final.index
    xf_final = xf_final.merge(status, how="left", on="doctorid")
    print("------------------------------------------------------")
    print('Writing Table to MySql')
    nn.write_mysql_table(xf_final, 'rec_out', 'mysql_con')
    print("All Done")
    return (1)