Example #1
def main():
    tag = nn.Dataframefactory("hcp_tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)

    mapping = mappingCbind(simi, tag)
    createDictStop()

    wechat = nn.Dataframefactory("wechat", iotype=iotype)

    web = nn.Dataframefactory("web", iotype=iotype)

    # Combine the WeChat and website data into a single DataFrame
    cbindBehavData = dataPrepare(wechat, web)
    if cbindBehavData.shape[0] == 0:
        print("ERROR!!!")
        print("NO VALID DATA IS PREPARED! PLEASE CHECK THE RAW DATA.")
        print()
    else:
        doctorList = list(set(cbindBehavData["doctorid"]))
        print("Finished Data preparation")

        contentTitle = cbindBehavData['content_title'].dropna(
        ).drop_duplicates().to_frame()
        contentLabeled = titleLabeling(contentTitle, mapping)
        allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                    on='content_title')
        allBehavDataLabelled["month_id"] = allBehavDataLabelled[
            "start_date"].apply(getMonthId)
        validBehavDataLabelled = allBehavDataLabelled[
            allBehavDataLabelled.lv2_tag.str.len() != 0]

        # calculate the heatmap data and chord diagram data
        heatMapPart = []
        chordMapPart = []
        print("Begin calculating")

        for docid in doctorList:
            segBehavData = validBehavDataLabelled[
                validBehavDataLabelled["doctorid"] == docid]
            if segBehavData.shape[0] != 0:
                segHeatData = statsBySegment(segBehavData, docid)
                heatMapPart.append(segHeatData)
                segChordData = chordStatsBySeg(segBehavData, docid)
                if segChordData.shape[0] != 0:
                    chordMapPart.append(segChordData)

        heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
        chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
        print("Finished calculating")

        nn.write_table(heatMapOutput, 'hcp_heatmap', iotype=iotype)
        # hcp_heatmap structure: four columns - doctorid, month_id, tag_name, tag_count
        nn.write_table(chordMapOutput, 'hcp_chordmap', iotype=iotype)
        # hcp_chordmap structure: four columns - doctorid, point_one, point_two, count

        return 1
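
Note that pd.concat raises "ValueError: No objects to concatenate" when heatMapPart or chordMapPart ends up empty (e.g. no doctor has valid labelled behavior). A minimal guard, as a sketch; the column list comes from the output structure documented above:

import pandas as pd

def safe_concat(parts, columns):
    """Concatenate partial results, or return an empty, well-formed frame."""
    if not parts:  # nothing was accumulated in the loop
        return pd.DataFrame(columns=columns)
    return pd.concat(parts, ignore_index=True)

# usage:
# heatMapOutput = safe_concat(heatMapPart,
#                             ["doctorid", "month_id", "tag_name", "tag_count"])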
Example #2
def load():
    print("Designed for Novo4PE: Content Rec Sys")
    print("------------------------------------------------------")
    print("Step 1: loading necessary data")
    #behavior_raw = pd.read_csv(
    #    '../essentials/itangyi_wechat_hcp_content_view')
    behavior_raw = nn.Dataframefactory('wechat', iotype=iotype)

    #content_lib_pat = pd.read_csv(
    #    "../essentials/idiabetes_patient_history_article")
    global content_lib_pat
    content_lib_pat = nn.Dataframefactory('pat_articles', iotype=iotype)

    #content_lib_hcp = pd.read_csv(
    #    "../essentials/idiabetes_hcp_history_article.csv")
    global content_lib_hcp
    content_lib_hcp = nn.Dataframefactory('hcp_articles', iotype=iotype)
    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Creating dictionary")
    create_dict_stop()
    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Processing Content_Lib and Behavior Data")
    global content_lib
    content_lib = content_lib_processing(content_lib_pat, content_lib_hcp)
    global behavior_all_indexed
    behavior_all_indexed = interaction_processing(behavior_raw)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Calculating Content Attribute")
    global corpus_list
    corpus_list = corpus_process(content_lib)
    gen_tfidf_matrix(corpus_list)

    global pat_tfidf_matrix, hcp_tfidf_matrix, pat_content_title, hcp_content_title
    pat_tfidf_matrix, hcp_tfidf_matrix, pat_content_title, hcp_content_title = split_tfidf_matrix(
        content_lib)
    global most_viewed
    most_viewed = get_ranked_content_title(behavior_all_indexed, content_lib)

    print("Step 4: Done")
    print("------------------------------------------------------")
    print("Step 5: Generating Personal Rec List")
    global uniq_usr
    uniq_usr = behavior_raw.hcp_openid_u_2.unique()

    # for usr_id in tqdm(uniq_usr):
    #     rec = generate_usr_rec(usr_id, behavior_all_indexed, content_lib, pat_tfidf_matrix,
    #                            pat_content_title, hcp_tfidf_matrix, hcp_content_title, most_viewed)

    print("Step 5: Done")
    print("------------------------------------------------------")
    print("LOAD COMPLETE")

    return ('ALGORITHM LOADING COMPLETE')
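
The per-user loop in Step 5 is left commented out above. A sketch of running it, assuming generate_usr_rec returns one user's recommendation frame (its signature is taken from the commented call) and that the per-user results are meant to be stacked:

import pandas as pd
from tqdm import tqdm

rec_parts = []
for usr_id in tqdm(uniq_usr):  # unique hcp_openid_u_2 values
    rec = generate_usr_rec(usr_id, behavior_all_indexed, content_lib,
                           pat_tfidf_matrix, pat_content_title,
                           hcp_tfidf_matrix, hcp_content_title, most_viewed)
    rec_parts.append(rec)
all_rec = pd.concat(rec_parts, ignore_index=True)  # stack per-user results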
Example #3
    def __init__(self):
        print("Preparing WeCall DataManager")
        self.__tokenizer = Seg()
        self.__tagger = TagSys()

        self.__behavior_raw = nn.Dataframefactory("behavior", iotype=iotype)
        self.__web_raw = nn.Dataframefactory("web", iotype=iotype)
        self.__wecall_content = nn.Dataframefactory("wecall_article",
                                                    iotype=iotype)
        self.__wecall_doctor = nn.Dataframefactory("wecall_doctor",
                                                   iotype=iotype)
        self.__wecall_sales = nn.Dataframefactory("wecall_sales",
                                                  iotype=iotype)
        self.__novo_hcp_info = nn.Dataframefactory("novo_hcp", iotype=iotype)
        self.__novo_market_lv = nn.Dataframefactory("novo_hcp_market",
                                                    iotype=iotype)
        self.__wecall_article_brand = nn.Dataframefactory("wecall_brand",
                                                          iotype=iotype)
        self.__wecall_content_detail = nn.Dataframefactory("wecall_detail",
                                                           iotype=iotype)

        self.__cooked_behavior, self.__wecall_bev = \
            self.__behavior_data_process()
        self.__hcp_behavior = self.__make_hcp_behavior()
        self.__behavior_content_tag = self.__make_behavior_content_tag()
        self.__wecall_content_tag = self.__make_wecall_content_tag()
        self.__wecall_hcp_market_mapping = self.__make_wecall_market_mapping()
        self.__wecall_behavior = self.__make_wecall_behavior()
        self.__hcp_title_market_info = self.__make_hcp_market_title()
        self.__wecall_content_brand = self.__make_wecall_content_brand()
        self.__wecall_content_url = self.__make_wecall_content_url()

        print("WeCall DataManager's ready to serve")
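
Example #18 below reads these name-mangled attributes through get_* accessors without calling them, so they are presumably properties. A sketch of one such accessor, assuming it sits inside the same DataManager class:

    @property
    def get_wecall_behavior(self):
        # double-underscore attributes are name-mangled, so a property is the
        # class's public window onto the processed behavior frame
        return self.__wecall_behavior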
Example #4
def main():
    tag = nn.Dataframefactory("pat_tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)

    mapping = mappingCbind(simi, tag)
    createDictStop()

    wechat = nn.Dataframefactory("wechat", iotype=iotype)

    # Consolidate the WeChat behavior data into a single DataFrame
    cbindBehavData = dataPrepare(wechat)
    patList = list(set(cbindBehavData["hcp_openid_u_2"]))
    print("Finished Data preparation")

    contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates(
    ).to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    print(contentLabeled)
    allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                on='content_title')
    allBehavDataLabelled["month_id"] = allBehavDataLabelled[
        "start_date"].apply(getMonthId)
    validBehavDataLabelled = allBehavDataLabelled[
        allBehavDataLabelled.lv2_tag.str.len() != 0]

    # calculate the heatmap data and chord diagram data
    heatMapPart = []
    chordMapPart = []
    print("Begin calculating")

    for openID in patList:
        segBehavData = validBehavDataLabelled[
            validBehavDataLabelled["hcp_openid_u_2"] == openID]
        if segBehavData.shape[0] != 0:
            segHeatData = statsBySegment(segBehavData, openID)
            heatMapPart.append(segHeatData)
            segChordData = chordStatsBySeg(segBehavData, openID)
            if segChordData.shape[0] != 0:
                chordMapPart.append(segChordData)

    heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
    chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
    print("Finished calculating")

    nn.write_table(heatMapOutput, 'pat_heatmap', iotype=iotype)
    # pat_heatmap structure: four columns - openID, month_id, tag_name, tag_count
    nn.write_table(chordMapOutput, 'pat_chordmap', iotype=iotype)
    # pat_chordmap structure: four columns - openID, point_one, point_two, count

    return 1
Example #5
    def __init__(self, option="hcp"):
        self.similar_words = nn.Dataframefactory('similar', iotype=iotype)

        if option == "hcp":
            self.tag = nn.Dataframefactory('hcp_tag', iotype=iotype)
            self.lb = "医生标签"
        elif option == "pat":
            self.tag = nn.Dataframefactory('pat_tag', iotype=iotype)
            self.lb = "病人标签"
        else:
            raise ValueError("option must be 'pat' or 'hcp'")
        self.mapping = self.__create_mapping()
        self.word_tag = self.__get_word_tag()
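
A usage sketch; the enclosing class name is not shown in the snippet, so TagMapper below is hypothetical:

tagger = TagMapper(option="pat")  # loads pat_tag plus the similar-words table
print(tagger.lb)                  # "病人标签", the patient-label column name
mapping = tagger.mapping          # built by the private __create_mapping()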
Example #6
def main():
    # pre-define path & variables
    corpus_raw = nn.Dataframefactory('labeledContent', sep='|', iotype='db',
                                     con=nnenv.getItem('mysql_url'))
    vector = "vectorizer.joblib"
    matrix = "tfidf.npy"
    outpath = nnenv.getResourcePath() 
    
    
    # load dict and stopwords
    createDictStop()
    
    # load corpus
    corpus = combineTitleAndContent(corpus_raw)
    

    # save content_id mapping
    content_id_mapping = corpus[["content_id"]]
    content_id_mapping.index.name = 'index'
    content_id_mapping.to_csv(outpath + nnenv.getItem('content_id_mapping')) 


    # transform corpus to right format
    corpus["corpus"] = corpus["all"].apply(segment)
    
    #create tfidf-matrix and vectorizer
    tfidfMatrix, vectorizer = createTfidfMatrix(corpus)
    
    # save essential files
    with open(outpath + vector, 'wb') as f:
        joblib.dump(vectorizer, f)
    
    np.save(outpath + matrix, tfidfMatrix)
    
    print("new tfidf_matrix and vectorizer have been saved into {}".format(outpath))
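
The saved artifacts are read back elsewhere (Example #9 goes through nn.Joblibfactory and nn.Numpyarrayfactory). A sketch of reloading them directly, assuming the same outpath:

import joblib
import numpy as np

vectorizer = joblib.load(outpath + "vectorizer.joblib")  # fitted vectorizer
tfidfMatrix = np.load(outpath + "tfidf.npy")
# if createTfidfMatrix returned a non-ndarray (e.g. a sparse matrix), np.save
# pickled it, and the load then needs allow_pickle=True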
Example #7
def createDictStop():
    """
    Load external dictionary and stop words
    """
    print("Loading Dictionary and Stopwords")
    global stopWord
    dic = nn.Dataframefactory('mappingword', sep='\r\n', iotype=iotype)
    word = dic.word.tolist()
    stopWord = nn.Dataframefactory('stopword', sep='\r\n', iotype=iotype)
    stopWord = stopWord.word.tolist()
    stopWord.append(" ")
    jieba.re_han_default = re.compile(
        r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/β/α/-]+)', re.UNICODE)
    frequency = 100000000000000000000000
    # Add words to the dictionary with a very high frequency so they never split
    for words in word:
        jieba.add_word(words, freq=frequency)
    print("Finished Dic Loading")
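
A quick check of what the loaded dictionary changes, once createDictStop() has installed the wider regex; the domain term below is hypothetical:

import jieba

createDictStop()
jieba.add_word("GLP-1受体激动剂", freq=10**9)  # hypothetical domain term
print(jieba.lcut("GLP-1受体激动剂临床研究"))
# with '-' and digits allowed by the custom re_han_default, the added term
# survives as a single token instead of being fragmented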
Example #8
    def __load_dict(self):
        jieba.re_han_default = re.compile(
            r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/”/“/"/β/α/-]+)', re.UNICODE)
        dictionary = nn.Dataframefactory('mappingword',
                                         sep='\r\n',
                                         iotype=iotype)

        for x in dictionary.word:
            jieba.add_word(x, self.freq)
Example #9
    def loading_everything():

        global tag, similar, mapping, clf, tfidf_matrix, labeled_corpus, title_list, content_id_mapping
        createDictStop()
        tag = nn.Dataframefactory('tag', iotype='fs')
        similar = nn.Dataframefactory('similar', iotype='fs')
        mapping = mappingCbind(similar, tag)

        clf = nn.Joblibfactory(nnenv.getItem('vectorizer'))
        tfidf_matrix = nn.Numpyarrayfactory(nnenv.getItem('tfidf'))

        labeled_corpus = nn.Dataframefactory('labeledContent',
                                             sep='|',
                                             iotype='fs',
                                             con=nnenv.getItem('mysql_url'))
        title_list = labeled_corpus.title.tolist()

        content_id_mapping = nn.Dataframefactory('content_id_mapping',
                                                 iotype='fs')
Example #10
def input_check(table):
    try:
        df = nn.Dataframefactory(table, iotype=iotype)
        if df.empty:
            raise ValueError("table {} is empty".format(table))
        return df
    except Exception as e:
        print("There was an error in your input table. "
              "Please double-check your data source:\n{}".format(e))
        return None  # callers must handle the missing frame
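
input_check returns None after printing the error, so callers should guard for it:

wechat = input_check("wechat")
if wechat is None:
    raise SystemExit("aborting: input table could not be loaded")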
Example #11
def createDictStop():
    """
    Load external dictionary and stop words
    """
    print("Loading Dictionary and Stopwords")
    global stopWord
    dic = nn.Dataframefactory('mappingword', sep='\r\n', iotype=iotype)
    #stopWord = pd.read_csv(dic_path + 'StopWordFinal.txt',
    #                       encoding='utf-8', engine='python', sep='\r\n')
    stopWord = nn.Dataframefactory('stopword', sep='\r\n', iotype=iotype)
    word = dic.word.tolist()
    stopWord = stopWord.word.tolist()
    jieba.re_han_default = re.compile(
        r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/β/α/-]+)', re.UNICODE)
    frequency = 1000000000000000000000
    # Add words to dictionary
    for words in word:
        jieba.add_word(words, freq=frequency)
    print("Dictionary and StopWord have been loaded")
Example #12
def main():
    raw = nn.Dataframefactory("pat_call_center", iotype=iotype)
    mapping = nn.Dataframefactory("pat_call_mapping", iotype=iotype)

    print("Begin aggregating patient questions")
    patQuesDf = sepQuestions(raw)
    print("Patient questions prepared")

    print("Begin calculating")
    quesMerge = pd.merge(patQuesDf,
                         mapping,
                         how="left",
                         left_on="customer_question",
                         right_on="question")
    output = quesMerge[[
        "patient_id", "customer_question", "question_category",
        "question_sub_category", "product_type"
    ]]
    print("Finished calculating")

    nn.write_table(output, 'pat_call_center_stats', iotype=iotype)
    # pat_call_center_stats structure: five columns - patient_id, customer_question, question_category, question_sub_category, product_type

    return 1
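
Because the merge is how="left", questions without a mapping entry keep NaN in the mapping columns. A sketch for surfacing them, assuming question_category is a reliable match indicator:

unmatched = output[output["question_category"].isna()]
if not unmatched.empty:
    print("{} patient questions had no mapping entry".format(len(unmatched)))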
Example #13
def create_dict_stop():
    """
    Load external dictionary and stop words
    """
    print("Loading Dictionary and Stopwords")
    global stopWord
    dic_path = '../essentials/'
    #dic = pd.read_csv(dic_path + "mappingWordFinal.txt",
    #                  encoding='utf-8', engine='python', sep='\r\n')
    dic = nn.Dataframefactory("mappingword", sep='\r\n', iotype=iotype)
    word = dic.word.tolist()
    #
    #stopWord = pd.read_csv(dic_path + 'StopWordFinal.txt',
    #                       encoding='utf-8', engine='python', sep='\r\n')
    stopWord = nn.Dataframefactory("stopword", sep='\r\n', iotype=iotype)
    stopWord = stopWord.word.tolist()
    stopWord.append(" ")
    jieba.re_han_default = re.compile(
        r'([\u0020\u4e00-\u9fa5a-zA-Z0-9+#&._%/”/“/"/β/α/-]+)', re.UNICODE)
    frequency = 100000000000000000000000
    # Add words to dictionary
    for words in word:
        jieba.add_word(words, freq=frequency)
    print("Finished Dic Loading")
Example #14
    def __load_stopword(self):
        stop = nn.Dataframefactory("stopword", sep='\r\n', iotype=iotype)

        self.stopwords = stop.word.tolist()
Example #15
import nndw
import nnenv

# the table name and iotype below are illustrative; every other snippet
# passes both arguments to Dataframefactory
df = nndw.Dataframefactory('wechat', iotype='fs')
Example #16
def main():
    tag = nn.Dataframefactory("tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)

    mapping = mappingCbind(simi, tag)
    createDictStop()

    novoHcpAgg = nn.Dataframefactory("hcp_ability_detailing", iotype=iotype)

    doctorList = list(set(novoHcpAgg["customer_code"]))

    wechat = nn.Dataframefactory("wechat", iotype=iotype)
    web = nn.Dataframefactory("web", iotype=iotype)

    # Combine the WeChat and website data into a single DataFrame
    cbindBehavData = dataPrepare(wechat, web, doctorList)
    print("Finished Data preparation")

    contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates(
    ).to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                on='content_title')
    allBehavDataLabelled["month_id"] = allBehavDataLabelled[
        "start_date"].apply(getMonthId)
    validBehavDataLabelled = allBehavDataLabelled[
        allBehavDataLabelled.lv2_tag.str.len() != 0]

    # segment mapping file, write this table to Hive
    allCbindDf = cbindAllConditions(novoHcpAgg)
    print("Created segment mapping file")

    # do lv2 tag stats and get the top 15 labels
    allLv2Stats = statsByLevel(validBehavDataLabelled, "lv2_tag")
    topLv2LabelsDf = getTopNLabels(allLv2Stats, 15)
    print("Found top 15 tags of all doctors")

    # for seg in all segments, for each month in all months in the segment
    # calculate the heatmap data and chord diagram data
    heatMapPart = []
    chordMapPart = []
    print("Begin calculating")

    for segId in allCbindDf["segment_id"]:
        segDocList = getSegDoctorList(allCbindDf, novoHcpAgg, segId)
        if len(segDocList) != 0:
            segBehavData = validBehavDataLabelled[
                validBehavDataLabelled["doctorid"].isin(segDocList)]
            if segBehavData.shape[0] != 0:
                segHeatData = statsBySegment(segBehavData, segId,
                                             topLv2LabelsDf)
                heatMapPart.append(segHeatData)
                segChordData = chordStatsBySeg(segBehavData, segId)
                if segChordData.shape[0] != 0:
                    chordMapPart.append(segChordData)

    heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
    chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
    print("Finished calculating")

    objNovoHcpAgg = novoHcpAgg.astype("object")
    mergeSegID = pd.merge(
        objNovoHcpAgg,
        allCbindDf,
        how="left",
        left_on=["detailing_path_id", "level", "academic_title", "department"],
        right_on=["detailing_path", "hcp_segment", "title", "department"])
    customerCodeSegId = mergeSegID[["customer_code", "segment_id"]]

    nn.write_table(heatMapOutput, 'heatmap', iotype=iotype)
    nn.write_table(chordMapOutput, 'chordmap', iotype=iotype)
    nn.write_table(allCbindDf, 'segmentmapping', iotype=iotype)
    nn.write_table(customerCodeSegId, 'customerCodeSegId', iotype=iotype)

    return 1
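
The astype("object") cast before the merge aligns key dtypes: recent pandas refuses to merge an int64 column against an object column. Note the cast alone does not make the values match; a sketch with hypothetical frames:

import pandas as pd

left = pd.DataFrame({"k": [1, 2]})                        # int64 key
right = pd.DataFrame({"k": ["1", "2"], "v": ["a", "b"]})  # object key
# pd.merge(left, right, on="k") raises on recent pandas:
#   "You are trying to merge on int64 and object columns"
print(pd.merge(left.astype("object"), right, on="k"))  # runs, but 1 != "1": no rows
print(pd.merge(left.astype(str), right, on="k"))       # astype(str) aligns values too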
Example #17
def main():
    print("Designed for Novo4PE-Pilot")
    print("------------------------------------------------------")
    print("Step 1: loading necessary data")

    tag = nn.Dataframefactory('tag', iotype=iotype)
    #similar = pd.read_csv('./essential/tag_similar_words.csv')
    similar = nn.Dataframefactory('similar', iotype=iotype)
    mapping = mappingCbind(similar, tag)

    #wechat = pd.read_excel("./essential/wechat_mengbo.xlsx")
    wechat = nn.Dataframefactory('wechat', iotype=iotype)
    wecall = wechat[wechat.module_2.isin(["WeCall 2.0", "WeCall 1.0"])]
    wecall_content = wecall.content_title.unique()
    #web = pd.read_excel("./essential/web_mengbo.xlsx")
    web = nn.Dataframefactory('web', iotype=iotype)

    #novo_hcp = pd.read_csv("./essential/novo_hcp")
    novo_hcp = nn.Dataframefactory('novo_hcp', iotype=iotype)

    #novo_market = pd.read_csv("./essential/novo_hcp_market")
    novo_market = nn.Dataframefactory('novo_hcp_market', iotype=iotype)
    article_url = nn.Dataframefactory('article_url', iotype=iotype)
    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Creating dictionary")
    createDictStop()
    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Processing Raw Data")
    wechatFilterd, webFilterd, validWechatLog, validWebLog, contentPrefData, LogData = dataPrepare(
        wechat, web)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Calculating Channel Preference")
    output1 = channelPref(wechatFilterd, webFilterd)
    print("Step 4: Done")
    contentTitle = contentPrefData['content_title'].dropna(
    ).drop_duplicates().to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    contentNew = contentPrefData.merge(contentLabeled, on='content_title')
    print("------------------------------------------------------")
    print("Step 5: Calculating HCP Content Preference and Interest Point")
    output2 = pd.DataFrame()
    output3 = pd.DataFrame()
    for dc_id in contentNew.doctorid.unique():
        contentInsteret, otherTags, lb, labelMap = calContInst(
            contentNew, dc_id)
        keywordCnt = calContKeyWord(
            contentNew, dc_id, lb, otherTags, labelMap, mapping)
        # DataFrame.append was removed in pandas 2.0; concat replaces it
        output2 = pd.concat([output2, contentInsteret])
        output3 = pd.concat([output3, keywordCnt])
    output2.reset_index(drop=True, inplace=True)
    output3.reset_index(drop=True, inplace=True)
    print("Step 5: Done")
    print("------------------------------------------------------")
    print("Step 6: Calculating HCP Reading History")
    webHistWithoutToken = webHistWithoutTokens(validWebLog)
    wechathistWithoutToken = wechatHistWithoutTokens(validWechatLog)
    output4 = readingHist(webHistWithoutToken,
                          wechathistWithoutToken, contentLabeled)
    print("Step 6: Done")
    content_uq = get_content_uniq(LogData)
    hcp_reading_history = get_hcp_reading_history(LogData)
    doctorid_uq = get_uniq_doctorid(LogData)
    hcp_lb_uq = get_hcp_label_uniq(mapping)
    content_lb = contentLabeled[["content_title", "HCP标签"]]
    content_lb_pop = content_lb.merge(
        content_uq[["content_id", "content_title", "popularity"]], on="content_title")
    # widen the table: one indicator column per HCP label makes counting easy
    content_lb_pop[hcp_lb_uq] = content_lb_pop["HCP标签"].apply(
        create_var, args=(hcp_lb_uq,))
    hcp_tech_class, hcp_info_pro = get_hcp_tech_class(
        novo_hcp, novo_market, doctorid_uq, hcp_reading_history)
    content_pop = content_lb_pop[["content_title", "popularity"]]
    hcp_class_mapping = get_hcp_class_mapping(
        hcp_info_pro, hcp_tech_class, doctorid_uq)

    content_lb_pop = content_lb_pop[content_lb_pop.content_title.isin(wecall_content)]
    print("------------------------------------------------------")
    print("Step 7: Generating HCP Personal Recommendation List")
    o2 = output2.copy()
    o2["Ratio"] = o2.Ratio.apply(p2f)
    output5 = pd.DataFrame()
    for doc_id in doctorid_uq:
        test = pd.DataFrame(np.nan, index=range(0, 5), columns=[
                            "doctorid", "rec_cnt", "method1", "method2", "method3"])

        test["method1"] = content_lb_pop[~content_lb_pop["content_title"].isin(hcp_reading_history.get(doc_id))] \
            .sort_values('popularity', ascending=False) \
            .head(5) \
            .content_title \
            .reset_index(drop=True)
    ###################################################################################################
        try:
            inst_list = get_most_interest_keyword(o2, doc_id)
            personal_rec = content_lb_pop[content_lb_pop[inst_list].any(axis=1)]
            test["method2"] = personal_rec[~personal_rec["content_title"].isin(hcp_reading_history.get(doc_id))] \
                .sort_values('popularity', ascending=False) \
                .head(5) \
                .content_title \
                .reset_index(drop=True)
        except IndexError:
            test["method2"] = np.nan
    ###################################################################################################
        try:
            hcp_class_content = get_hcp_class(
                hcp_tech_class, hcp_class_mapping, doc_id, content_lb_pop)
        except IndexError:
            hcp_class_content = pd.DataFrame(
                columns=["content_title", "popularity"])

        test["method3"] = hcp_class_content[~hcp_class_content["content_title"].isin(hcp_reading_history.get(doc_id))] \
            .sort_values('popularity', ascending=False) \
            .head(5) \
            .content_title \
            .reset_index(drop=True)
        test["doctorid"] = doc_id
        test["rec_cnt"] = test.index + 1

        output5 = pd.concat([output5, test])

    output5 = output5.reset_index(drop=True)
    url = article_url[["title", "url"]]
    output5_1 = output5.merge(url, left_on="method1", right_on="title", how="left")
    output5_1 = output5_1.merge(url, left_on="method2", right_on="title",
                                how="left", suffixes=("_1", "_2"))
    output5_1 = output5_1.merge(url, left_on="method3", right_on="title",
                                how="left", suffixes=("_1", "_2"))
    col_drop = ["title_1", "title_2", "title"]  # drop the merge keys pulled in from url
    output5_1.drop(columns=col_drop, inplace=True)
    col_left = ["doctorid", "rec_cnt", "method1", "method2", "method3",
                "url_1", "url_2", "url"]
    output5_1 = output5_1[col_left]
    output5_1.rename(columns={"url": "url_3"}, inplace=True)
    print("Step 7: Done")
    print("------------------------------------------------------")
    print("ALL COMPLETE")

    nn.write_table(output1, 'hcp_channel_preference', iotype=iotype)
    nn.write_table(output2, 'hcp_content_interest', iotype=iotype)
    nn.write_table(output3, 'hcp_content_interest_keyword', iotype=iotype)
    nn.write_table(output4, 'hcp_reading_history', iotype=iotype)
    nn.write_table(output5_1, 'hcp_recommendation', iotype=iotype)

    return 1
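
The three methods in Step 7 share the same "top 5 unread, by popularity" selection, and hcp_reading_history.get(doc_id) can return None (which .isin() rejects). A helper capturing the shared pattern, as a sketch; the name is hypothetical:

def top5_unread(candidates, read_titles):
    """Five most popular candidate titles the doctor has not read yet."""
    read = read_titles or []  # .get(doc_id) may return None
    return candidates[~candidates["content_title"].isin(read)] \
        .sort_values("popularity", ascending=False) \
        .head(5) \
        .content_title \
        .reset_index(drop=True)

# usage: test["method1"] = top5_unread(content_lb_pop, hcp_reading_history.get(doc_id))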
Example #18
def main():
    print("Designed for WeCall Mini Program Article RecSys")
    print("------------------------------------------------------")
    print("Step 1: Loading necessary data")

    dm = DataManager()
    status = nn.Dataframefactory("wecall_doctor",
                                 iotype=iotype)[["code", "status"]]
    status.columns = ["doctorid", "status"]
    # Load data
    wecall_behavior = dm.get_wecall_behavior  # behavior data
    wecall_content = dm.get_wecall_content_tag  # article library
    hcp_market_title = dm.get_hcp_market_title  # market / region data
    doc_list = dm.get_wecall_doctor  # doctor list
    all_behavior_data = dm.get_all_behavior
    behavior_content_tag = dm.get_behavior_content_tag
    wecall_content_tag = dm.get_wecall_content_tag
    hcp_brand = dm.get_hcp_market_mapping
    content_brand = dm.get_wecall_article_brand
    content_brand = content_brand.rename(columns={'document_id': 'content_id'})
    wecall_url = dm.get_wecall_url

    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Processing necessary data")
    # Data processing
    all_content = wecall_content[['content_id', 'content_title']]  # keep the useful columns
    all_content = all_content.drop_duplicates(['content_id'])  # de-duplicate articles
    # normalize the behavior data's content_id format  ### may not be useful
    wecall_behavior['content_id'] = dm.get_wecall_behavior.content_id.str.split(
        pat=".", n=1, expand=True)[0]

    # Compute the most popular items: rank the article library by popularity
    behavior_popularity_df = wecall_behavior.groupby(['doctorid', 'content_id'])['strength'].sum(). \
        sort_values(ascending=False).reset_index()
    item_popularity_df = wecall_behavior.groupby([
        'content_id'
    ])['strength'].sum().sort_values(ascending=False).reset_index()

    # merge with the article content
    all_content_merge = all_content.merge(item_popularity_df,
                                          how="left",
                                          on="content_id")
    all_content_merge = all_content_merge.fillna(0)

    all_behavior_merge = all_content.merge(behavior_popularity_df,
                                           how="left",
                                           on="content_id")
    all_behavior_merge = all_behavior_merge.fillna(0)

    all_behavior_merge = all_behavior_merge.merge(content_brand,
                                                  how='left',
                                                  on="content_id")
    popularity_df = all_content_merge.groupby('content_id')['strength'].sum(
    ).sort_values(ascending=False).reset_index()
    popularity_df = popularity_df.merge(content_brand,
                                        how='left',
                                        on="content_id")

    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Generating Recommendation by popularity")
    ### Method 3 ---- Popularity Ranking

    start1 = time.time()
    popularity_model = PopularityRecommender(all_behavior_merge,
                                             popularity_df)  # model inputs
    doctor_list = DataFrame(doc_list)  # doctors to receive recommendations
    doctor_list = doctor_list.rename(columns={0: 'doctor_id'})
    doctor_list = doctor_list.merge(hcp_brand, how='left', on='doctor_id')
    method3_final = popularity_model.deliver_final(doctor_list)

    end1 = time.time()
    running_time1 = end1 - start1
    print('time cost : %.5f sec' % running_time1)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Generating Recommendation by Colleague")
    ### Method 2 ---- Colleague Recommendation
    start2 = time.time()
    cl_rc = ColleagueRcsys(wecall_behavior, hcp_market_title, hcp_brand,
                           content_brand, doc_list, all_content)
    method2_final = cl_rc.delivery_final()
    method2_final = method2_final[[
        'doctor_id', 'content_id', 'strength', 'method'
    ]]
    end2 = time.time()
    running_time2 = end2 - start2
    print('time cost : %.5f sec' % running_time2)
    print("Step 4: Done")
    print("------------------------------------------------------")
    print("Step 5: Generating Recommendation by Guess what you Like")
    ### Method 1 ---- guess what you like
    start3 = time.time()
    method1_final = recommand(all_behavior_data, behavior_content_tag,
                              wecall_content_tag, hcp_brand, content_brand,
                              doc_list)
    method1_final = method1_final[[
        'doctor_id', 'content_id', 'strength', 'method'
    ]]
    end3 = time.time()
    running_time3 = end3 - start3
    print('time cost : %.5f sec' % running_time3)
    print("Step 5: Done")
    print("------------------------------------------------------")
    print("Step 6: Generating Final Recommendation Result")
    start4 = time.time()
    final_recommend = pd.concat([method1_final, method2_final],
                                ignore_index=True)
    final_recommend_new = final_recommend.groupby(
        ['doctor_id', 'content_id'])['method'].min().reset_index()
    final_recommend_new = final_recommend_new.groupby(['doctor_id',
                                                       'method']).head(5)
    final_recommend_2 = pd.concat([final_recommend_new, method3_final],
                                  ignore_index=True)
    popularity_df_update = popularity_df[['content_id',
                                          'strength']].drop_duplicates()
    final_output = final_recommend_2.groupby(['doctor_id', 'content_id'
                                              ])['method'].min().reset_index()
    final_output = final_output.merge(popularity_df_update,
                                      how='left',
                                      on='content_id')
    # final_with_url = final_output.merge(wecall_url, how='left', on='content_id')
    # print("add url")
    #final_output.to_csv('stage01.csv')
    df1 = final_output[final_output['method'] == 1]
    df2 = final_output[final_output['method'] == 2]
    df3 = final_output[final_output['method'] == 3]
    t = Transformer(df1)
    x1 = t.getDataframe()
    if df2.empty:
        x2 = DataFrame(
            [],
            columns=['doctor_id', 'xn1', 'content_id', 'method', 'strength'])
    else:
        t.setDataframe(df2)
        x2 = t.getDataframe()
    t.setDataframe(df3)
    x3 = t.getDataframe()

    xf = x3.merge(x2, on=['doctor_id', 'xn1'],
                  how='left').merge(x1, on=['doctor_id', 'xn1'], how='left')
    xf['createtime'] = time.strftime("%m/%d/%Y %H:%M", time.localtime())
    xf = xf.rename(
        columns={
            'doctor_id': 'doctorid',
            'xn1': 'rec_cnt',
            'content_id_x': 'm1_id',
            'content_id': 'm2_id',
            'content_id_y': 'm3_id'
        })
    xf1 = xf.merge(wecall_url,
                   how='left',
                   left_on='m1_id',
                   right_on='content_id').rename(columns={
                       'content_title': 'method1',
                       'url': 'url_1'
                   })
    xf2 = xf1.merge(wecall_url,
                    how='left',
                    left_on='m2_id',
                    right_on='content_id').rename(columns={
                        'content_title': 'method2',
                        'url': 'url_2'
                    })
    xf3 = xf2.merge(wecall_url,
                    how='left',
                    left_on='m3_id',
                    right_on='content_id').rename(columns={
                        'content_title': 'method3',
                        'url': 'url_3'
                    })
    xf_final = xf3[[
        'doctorid', 'rec_cnt', 'method1', 'm1_id', 'method2', 'm2_id',
        'method3', 'm3_id', 'url_1', 'url_2', 'url_3', 'createtime'
    ]]
    # escape ASCII commas in the recommended titles; the replacement character
    # was garbled in the source, a full-width '，' is assumed here
    for col in ["method1", "method2", "method3"]:
        xf_final[col] = xf_final[col].str.replace(',', '，', regex=False)
    end4 = time.time()
    running_time4 = end4 - start4
    print('time cost : %.5f sec' % running_time4)
    print("Step 6: Done")
    #print("------------------------------------------------------")
    #print("Writing Table to Hive")
    #nn.write_table(xf_final,"rec_out",iotype=iotype)
    xf_final['Index'] = xf_final.index
    xf_final = xf_final.merge(status, how="left", on="doctorid")
    print("------------------------------------------------------")
    print('Writing Table to MySql')
    nn.write_mysql_table(xf_final, 'rec_out', 'mysql_con')
    print("All Done")
    return 1
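
Each step above times itself with a repeated start/end time.time() pair. A small context manager would capture the pattern, as a sketch:

import time
from contextlib import contextmanager

@contextmanager
def step_timer():
    """Print elapsed wall-clock time for the enclosed block."""
    start = time.time()
    yield
    print('time cost : %.5f sec' % (time.time() - start))

# usage:
# with step_timer():
#     method3_final = popularity_model.deliver_final(doctor_list)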