def getFeatureRMSEAgainstBaseline(cols=None):
    """Train an OrdinalRegression ('ordridge') ranker on a saved feature CSV.

    Loads the precomputed feature set, keeps only the requested feature
    columns (plus the id/label columns the ranker needs), and trains the
    model, printing timing checkpoints along the way.

    :param cols: list of feature column names to evaluate against the
        baseline; defaults to ['color_exist']. The list is copied, never
        mutated.
    """
    # BUG FIX: the original signature used a mutable default
    # (cols=['color_exist']) and then append()ed to it, so the shared
    # default list grew on every call and leaked label columns into it.
    if cols is None:
        cols = ['color_exist']
    cols = list(cols)  # defensive copy: never mutate the caller's list

    utility = Utility()
    utility.startTimeTrack()
    # This part skips the feature training and simply uses the saved set.
    print("len(cols):", len(cols), cols)
    print("Reading feature set")
    all_df = pd.read_csv('../data/features_doc2vec_sense2vec_pmi_20170418.csv')
    # First 74067 rows are the training portion; .copy() avoids the
    # SettingWithCopyWarning from in-place drops on a slice view.
    feature_train_df = all_df[:74067].copy()
    # Must drop these columns for OrdinalRegression
    feature_train_df.drop('wm_product_brand', axis=1, inplace=True)
    # Id/label columns required alongside the candidate features.
    cols.append('relevance_int')
    cols.append('id')
    cols.append('search_term')
    cols.append('product_uid')
    cols.append('relevance')
    cols.append('product_idx')
    cols.append('Word2VecQueryExpansion')
    print(cols)
    feature_train_df = feature_train_df.filter(items=cols, axis=1)
    # Remaining rows are the unlabelled test portion.
    feature_test_df = all_df[74067:].copy()
    feature_test_df.drop('relevance', axis=1, inplace=True)
    utility.checkpointTimeTrack()
    print("#### Running: OrdinalRegression ordridge training ####")
    print("feature_train_df:", list(feature_train_df))
    orModel = OrdinalRegressionRanker('ordridge')
    orModel.train(feature_train_df, None)
    # orModel.gridSearch(feature_train_df, None)
    print("#### Completed: OrdinalRegression ordridge training ####")
    utility.checkpointTimeTrack()
# Load the full precomputed feature set (train and test rows stacked) and
# split it back into its train/test partitions.
# NOTE(review): `utility` and `myFeatureSetFileReference` are defined in the
# enclosing (unseen) scope -- presumably a Utility() time tracker and the
# feature CSV path; confirm against the surrounding function.
print("Reading features_doc2vec_sense2vec_pmi_20170418 set")
all_df = pd.read_csv(myFeatureSetFileReference, low_memory=True)
print("Completed: Reading features_doc2vec_sense2vec_pmi_20170418 set")
# First 74067 rows are the training portion (Home Depot train-set size).
feature_train_df = all_df[:74067]
# feature_train_df.drop('doc2vec_search_term_vector', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_product_title_vector', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_product_brand_vector', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_product_description_vector', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_attr_json_vector', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_Word2VecQueryExpansion_vector', axis=1, inplace=True)
# This column must be removed before training (see the baseline runner).
feature_train_df.drop('wm_product_brand', axis=1, inplace=True)
# Remaining rows are the unlabelled test portion; 'relevance' is train-only.
feature_test_df = all_df[74067:]
feature_test_df.drop('relevance', axis=1, inplace=True)
utility.checkpointTimeTrack()
# Features to play with (toggle by uncommenting the drops below).
# feature_train_df.drop('tfidf_product_title', axis=1, inplace=True)
# feature_train_df.drop('tfidf_product_brand', axis=1, inplace=True)
# feature_train_df.drop('tfidf_product_description', axis=1, inplace=True)
# feature_train_df.drop('tfidf_attr_json', axis=1, inplace=True)
# feature_train_df.drop('tfidf_expanded_product_title', axis=1, inplace=True)
# feature_train_df.drop('tfidf_expanded_product_brand', axis=1, inplace=True)
# feature_train_df.drop('tfidf_expanded_product_description', axis=1, inplace=True)
# feature_train_df.drop('tfidf_expanded_attr_json', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_product_title', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_product_brand', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_product_description', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_attr_json', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_expanded_product_title', axis=1, inplace=True)
# feature_train_df.drop('doc2vec_expanded_product_brand', axis=1, inplace=True)
def getFeature(self, train_query_df, product_df, attribute_df, test_query_df, features="brand,attribute,spelling,nonascii,stopwords,colorExist,color_onehot,brandExist,wmdistance,stemming,word2vec,Word2VecQueryExpansion,tfidf,tfidf_expandedquery,doc2vec,doc2vec_expandedquery,bm25,bm25expandedquery,doclength"):
    """Generate the requested feature columns on ``train_query_df``.

    Each comma-separated token in ``features`` switches on one feature
    section below. Matching uses substring search (``features.find``), so a
    token like "bm25expandedquery" also enables the plain "bm25" section --
    kept as in the original; order tokens deliberately.

    :param train_query_df: query rows (search_term, product_uid, ...);
        feature columns are added to this frame.
    :param product_df: product corpus (title, description, brand, ...).
    :param attribute_df: raw attribute rows, folded into product_df.
    :param test_query_df: currently unused here -- kept for interface
        compatibility with callers.
    :param features: comma-separated feature switch string.
    :return: train_query_df with the generated feature columns.
    """
    ## Please feel free to add feature into this method.
    ## For testing, you may want to comment out some feature generation to
    ## save time as some take a long time to run.
    timetracker = Utility()

    def _minutes_since(start_time):
        # Elapsed wall-clock minutes since start_time, 2 d.p. (logging only).
        return round(((time.time() - start_time) / 60), 2)

    def _append_queries_to_content(product_df, queries_df):
        # For every product, concatenate all of its training search terms
        # onto its 'content' text so the query vocabulary is part of the
        # document. (This loop used to be copy/pasted seven times below.)
        product_df = product_df.reset_index(drop=True)
        counter = 0
        for _, product in product_df.iterrows():
            productId = product['product_uid']
            matching = queries_df[queries_df.product_uid == productId]
            searchterms = ""
            for _, row in matching.iterrows():
                searchterms = searchterms + " " + row['search_term']
            # FIX: DataFrame.set_value() was removed in pandas 1.0; .at is
            # the supported scalar setter and behaves identically here.
            product_df.at[counter, 'content'] = product_df.iloc[counter]['content'] + " " + searchterms
            counter = counter + 1
        return product_df

    def _bm25_feature(train_query_df, product_df, destColName, searchTermColname, csvPath, doneMsg):
        # Shared tail of every BM25 variant: fold queries into 'content',
        # build the BM25 index, score every query/document pair into
        # destColName, then checkpoint and dump to CSV.
        print("Adding training query for that product id into the content")
        product_df = _append_queries_to_content(product_df, train_query_df)
        timetracker.checkpointTimeTrack()
        print("Compute BM25")
        bm25 = Feature_BM25(product_df)
        timetracker.checkpointTimeTrack()
        print("Remove merged column")
        product_df = product_df.drop('content', axis=1)
        # For every training query-document pair, generate bm25
        print("Generate bm25 column")
        train_query_df = bm25.computeBM25Column(trainset=train_query_df, destColName=destColName, searchTermColname=searchTermColname)
        timetracker.checkpointTimeTrack()
        print("train_query_df:", list(train_query_df))
        print("train_query_df head:", train_query_df.head(1))
        print("Saving to csv")
        train_query_df.to_csv(csvPath)
        timetracker.checkpointTimeTrack()
        print(doneMsg)
        return train_query_df, product_df

    if features.find("brand") != -1:
        # Create Brand Column
        product_df = self.__createBrandColumn(product_df, attribute_df)

    if features.find("attribute") != -1:
        # Create Attribute column as a JSON string; column name is attr_json.
        product_df = self.__createAttributeColumn(product_df, attribute_df)

    if features.find("spelling") != -1:
        # Perform spell correction on search_term and the text fields.
        print("Performing spell correction")
        spell_dict = Feature_Spelling.getSpellingCorrectionDict()
        train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__spell_correction(x, spell_dict))
        product_df['product_description'] = product_df['product_description'].map(lambda x: self.__spell_correction(x, spell_dict))
        product_df['product_title'] = product_df['product_title'].map(lambda x: self.__spell_correction(x, spell_dict))
        product_df['attr_json'] = product_df['attr_json'].map(lambda x: self.__spell_correction(str(x), spell_dict))

    if features.find("nonascii") != -1:
        # Remove non-ascii characters
        print("Performing non-ascii removal")
        start_time = time.time()
        train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__nonascii_clean(x))
        print("Non-ascii clean on search_term took: %s minutes" % _minutes_since(start_time))
        product_df['product_title'] = product_df['product_title'].map(lambda x: self.__nonascii_clean(str(x)))
        # NOTE: start_time is not reset, so this logs cumulative time for
        # both columns -- kept as in the original.
        print("Non-ascii clean on product_title took: %s minutes" % _minutes_since(start_time))

    # Run nltk.download() to fetch the stopword list if you hit an error.
    if features.find("stopwords") != -1:
        # Stopwords removal
        print("Performing stopwords removal")
        start_time = time.time()
        train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__stopword_removal(x))
        print("stopwords removal on search_term took: %s minutes" % _minutes_since(start_time))
        product_df['product_title'] = product_df['product_title'].map(lambda x: self.__stopword_removal(str(x)))
        print("stopwords removal on product_title took: %s minutes" % _minutes_since(start_time))
        product_df['product_description'] = product_df['product_description'].map(lambda x: self.__stopword_removal(str(x)))
        print("stopwords removal on product_description took: %s minutes" % _minutes_since(start_time))
        product_df['attr_json'] = product_df['attr_json'].map(lambda x: self.__stopword_removal(str(x)))
        print("stopwords removal on attr_jason took: %s minutes" % _minutes_since(start_time))

    if features.find("colorExist") != -1:
        # Flag whether a color/material named in the query appears in the
        # product text.
        print("Performing color and material check")
        start_time = time.time()
        color = Feature_ColorMaterial()
        train_query_df['color'] = color.checkColorMaterialExists(train_query_df, product_df)
        train_query_df['color_exist'] = train_query_df['color'].map(lambda x: 1 if len(x) > 0 else 0)
        # Save some memory: store the flag as uint8.
        train_query_df.color_exist = train_query_df.color_exist.astype(np.uint8)
        if features.find("color_onehot") != -1:
            # One-hot encode detected colors (requires colorExist).
            train_query_df = self.__onehot_color(train_query_df)
        # Clean up the intermediate column.
        train_query_df.pop('color')
        print("Color and material check took: %s minutes" % _minutes_since(start_time))

    if features.find("brandExist") != -1:
        # Flag whether the brand in the search term matches product_brand.
        print("Performing brand check")
        start_time = time.time()
        train_query_df['brand_exist'] = self.__brandExist(train_query_df, product_df)
        print("Brand check took: %s minutes" % _minutes_since(start_time))

    if features.find('wmdistance') != -1:
        # Word-mover distance between the query and each document field.
        print("Performing Word Mover Distance")
        start_time = time.time()
        wm = Feature_WordMoverDistance()
        train_query_df['wm_product_description'] = wm.getDistance(train_query_df, 'search_term', product_df, 'product_description')
        print("WMDistance for product_description took: %s minutes" % _minutes_since(start_time))
        train_query_df['wm_product_title'] = wm.getDistance(train_query_df, 'search_term', product_df, 'product_title')
        print("WMDistance for product_title took: %s minutes" % _minutes_since(start_time))
        train_query_df['wm_product_brand'] = wm.getDistance(train_query_df, 'search_term', product_df, 'product_brand')
        print("WMDistance for product_brand took: %s minutes" % _minutes_since(start_time))
        train_query_df['wm_attr_json'] = wm.getDistance(train_query_df, 'search_term', product_df, 'attr_json')
        print("WMDistance for attr_json took: %s minutes" % _minutes_since(start_time))

    if features.find("stemming") != -1:
        # Stemming of query and all text fields.
        print("Performing Stemming")
        start_time = time.time()
        train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__stemming(x))
        print("Stemming search_term took: %s minutes" % _minutes_since(start_time))
        product_df['product_title'] = product_df['product_title'].map(lambda x: self.__stemming(str(x)))
        print("Stemming product_title took: %s minutes" % _minutes_since(start_time))
        product_df['product_brand'] = product_df['product_brand'].map(lambda x: self.__stemming(str(x)))
        print("Stemming product_brand took: %s minutes" % _minutes_since(start_time))
        product_df['product_description'] = product_df['product_description'].map(lambda x: self.__stemming(str(x)))
        print("Stemming product_description took: %s minutes" % _minutes_since(start_time))
        product_df['attr_json'] = product_df['attr_json'].map(lambda x: self.__stemming(str(x)))
        print("Stemming attr_json took: %s minutes" % _minutes_since(start_time))

    if features.find("word2vec") != -1:
        # Train a word2vec model over title+description+brand plus all
        # training queries for each product.
        print("===========Performing word2vec computation....this may take a while")
        timetracker.startTimeTrack()
        print("Merging product_title and description")
        print(list(product_df))
        product_df['content'] = product_df['product_title'].map(str) + " " + \
                                product_df['product_description'].map(str) + " " + \
                                product_df['product_brand'].map(str)
        timetracker.checkpointTimeTrack()
        print("Adding training query for that product id into the content")
        product_df = _append_queries_to_content(product_df, train_query_df)
        timetracker.checkpointTimeTrack()
        w2v = Feature_Word2Vec.Feature_Word2Vec()
        print("Convert DF into sentences for word2vec processing")
        sentences = w2v.convertDFIntoSentences(product_df, 'content')
        timetracker.checkpointTimeTrack()
        print("Training word2vec")
        w2v.trainModel(sentences)
        timetracker.checkpointTimeTrack()
        print("Validating...this should give some results like sofa")
        print(w2v.getVectorFromWord('stool'))
        print(w2v.getSimilarWordVectors('stool', 5))
        print("===========Completed word2vec computation")

    # WARNING: This has to run before the bm25expandedquery section.
    if features.find("Word2VecQueryExpansion") != -1:
        # Expand each query with word2vec-similar terms into a new column.
        print("===========Performing Word2VecQueryExpansion computation....this may take a super long time")
        timetracker.startTimeTrack()
        print("Compute Word2VecQueryExpansion")
        w2cExpand = Word2VecQueryExpansion()
        timetracker.checkpointTimeTrack()
        print("Generate Word2VecQueryExpansion column")
        train_query_df = w2cExpand.computeExpandedQueryColumn(trainset=train_query_df, colName='Word2VecQueryExpansion')
        timetracker.checkpointTimeTrack()
        print("train_query_df:", list(train_query_df))
        print("train_query_df head:", train_query_df.head(1))
        print("Saving to csv")
        train_query_df.to_csv('../data.prune/train_query_with_Word2VecQueryExpansion.csv')
        timetracker.checkpointTimeTrack()
        print("===========Completed Word2VecQueryExpansion computation")

    if features.find("tfidf") != -1:
        # TF-IDF cosine similarity of the raw query against each field.
        print("Performing TF-IDF")
        tfidf = Feature_TFIDF()
        train_query_df['tfidf_product_title'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df, 'product_title')
        train_query_df['tfidf_product_brand'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df, 'product_brand')
        train_query_df['tfidf_product_description'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df, 'product_description')
        train_query_df['tfidf_attr_json'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df, 'attr_json')

    if features.find("tfidf_expandedquery") != -1:
        # TF-IDF cosine similarity of the expanded query against each field.
        print("Performing TF-IDF with expanded query")
        tfidf = Feature_TFIDF()
        train_query_df['tfidf_expanded_product_title'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'product_title')
        train_query_df['tfidf_expanded_product_brand'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'product_brand')
        train_query_df['tfidf_expanded_product_description'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'product_description')
        train_query_df['tfidf_expanded_attr_json'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'attr_json')

    if features.find("doc2vec") != -1:
        # Doc2Vec similarity of the raw query against each field.
        # NOTE(review): a fresh Feature_Doc2Vec is built per column, as in the
        # original -- presumably each call trains its own model; confirm
        # before sharing one instance.
        print("Performing Doc2Vec")
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_product_title'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df, 'product_title')
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_product_brand'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df, 'product_brand')
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_product_description'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df, 'product_description')
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_attr_json'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df, 'attr_json')

    if features.find("doc2vec_expandedquery") != -1:
        # Doc2Vec similarity of the expanded query against each field.
        print("Performing Doc2Vec with expanded query")
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_expanded_product_title'] = doc2vec.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'product_title')
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_expanded_product_brand'] = doc2vec.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'product_brand')
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_expanded_product_description'] = doc2vec.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'product_description')
        doc2vec = Feature_Doc2Vec()
        train_query_df['doc2vec_expanded_attr_json'] = doc2vec.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df, 'attr_json')

    if features.find("bm25") != -1:
        # BM25 of the raw query vs title+description+brand.
        print("===========Performing BM25 computation....this may take a while")
        timetracker.startTimeTrack()
        print("Merging product_title and description")
        print(list(product_df))
        product_df['content'] = product_df['product_title'].map(str) + " " + \
                                product_df['product_description'].map(str) + " " + \
                                product_df['product_brand'].map(str)
        timetracker.checkpointTimeTrack()
        train_query_df, product_df = _bm25_feature(train_query_df, product_df, 'bm25', 'search_term', '../data.prune/train_query_with_bm25_search_term.csv', "===========Completed BM25 computation")

    if features.find("bm25expandedquery") != -1:
        if features.find("Word2VecQueryExpansion") != -1:
            # BM25 of the expanded query vs title+description+brand.
            print("===========Performing BM25expanded computation....this may take a while")
            timetracker.startTimeTrack()
            print("Merging product_title and description")
            print(list(product_df))
            product_df['content'] = product_df['product_title'].map(str) + " " + \
                                    product_df['product_description'].map(str) + " " + \
                                    product_df['product_brand'].map(str)
            timetracker.checkpointTimeTrack()
            train_query_df, product_df = _bm25_feature(train_query_df, product_df, 'bm25expandedquery', 'Word2VecQueryExpansion', '../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv', "===========Completed BM25expanded computation")
        else:
            print("ERROR: Cannot proceed with bm25expandedquery. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")

    if features.find("bm25description") != -1:
        if features.find("Word2VecQueryExpansion") != -1:
            # BM25 of the expanded query vs product_description only.
            print("===========Performing bm25description computation....this may take a while")
            timetracker.startTimeTrack()
            print(list(product_df))
            product_df['content'] = product_df['product_description'].map(str)
            timetracker.checkpointTimeTrack()
            # NOTE(review): writes the same CSV path as bm25expandedquery
            # (overwrites it) -- kept as in the original; confirm intent.
            train_query_df, product_df = _bm25_feature(train_query_df, product_df, 'bm25description', 'Word2VecQueryExpansion', '../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv', "===========Completed bm25description computation")
        else:
            print("ERROR: Cannot proceed with bm25description. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")

    if features.find("bm25title") != -1:
        if features.find("Word2VecQueryExpansion") != -1:
            # BM25 of the expanded query vs product_title only.
            print("===========Performing bm25title computation....this may take a while")
            timetracker.startTimeTrack()
            print(list(product_df))
            product_df['content'] = product_df['product_title'].map(str)
            timetracker.checkpointTimeTrack()
            train_query_df, product_df = _bm25_feature(train_query_df, product_df, 'bm25title', 'Word2VecQueryExpansion', '../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv', "===========Completed bm25title computation")
        else:
            print("ERROR: Cannot proceed with bm25title. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")

    if features.find("bm25brand") != -1:
        if features.find("Word2VecQueryExpansion") != -1:
            # BM25 of the expanded query vs product_brand only.
            print("===========Performing bm25brand computation....this may take a while")
            timetracker.startTimeTrack()
            print(list(product_df))
            product_df['content'] = product_df['product_brand'].map(str)
            timetracker.checkpointTimeTrack()
            train_query_df, product_df = _bm25_feature(train_query_df, product_df, 'bm25brand', 'Word2VecQueryExpansion', '../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv', "===========Completed bm25brand computation")
        else:
            print("ERROR: Cannot proceed with bm25brand. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")

    if features.find("doclength") != -1:
        # Token-count length features for each field plus the query itself.
        print("Performing Document Length")
        product_df['len_product_title'] = product_df['product_title'].map(lambda x: len(homedepotTokeniser(x)))
        train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_product_title']], how='left', on='product_uid')
        product_df['len_product_description'] = product_df['product_description'].map(lambda x: len(homedepotTokeniser(x)))
        train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_product_description']], how='left', on='product_uid')
        product_df['len_brand'] = product_df['product_brand'].map(lambda x: len(homedepotTokeniser(x)))
        train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_brand']], how='left', on='product_uid')
        train_query_df['len_search_term'] = train_query_df['search_term'].map(lambda x: len(homedepotTokeniser(x)))

    if features.find("pmi") != -1:
        # Pointwise mutual information computed over the merged corpus text.
        print("===========Performing pmi computation....this may take a while")
        timetracker.startTimeTrack()
        print(list(product_df))
        product_df['content'] = product_df['product_title'].map(str) + " " + \
                                product_df['product_description'].map(str)
        timetracker.checkpointTimeTrack()
        print("Adding training query for that product id into the content")
        product_df = _append_queries_to_content(product_df, train_query_df)
        timetracker.checkpointTimeTrack()
        # Creating one long corpus string for PMI counting.
        text = product_df['content'].str.cat(sep=' ')
        pmiFeature = Feature_PMI.Feature_PMI(text)
        train_query_df = pmiFeature.computePMIColumn(trainset=train_query_df)

    print("train_query_df final column:\n", train_query_df.info())
    return train_query_df
def exeFMBidModel(testDF=None, validateDF=None, trainDF=None, trainReader=None, validationReader=None, testReader=None, writeResult2CSV=False):
    """Train and validate the Factorisation Machine (SGD classification) bid model.

    One-hot encodes the train/validation/test sets via the supplied readers
    (validation/test are aligned to the training columns), trains the FM
    model, and returns the validation click probabilities.

    :param testDF, validateDF, trainDF, writeResult2CSV: currently unused --
        kept for interface compatibility with callers.
    :param trainReader, validationReader, testReader: dataset readers
        exposing getOneHotData().
    :return: predicted probabilities from validateModel().
    """
    print("============ Factorisation Machine bid model....setting up")
    timer = Utility()
    timer.startTimeTrack()
    print("Getting encoded datasets")
    trainOneHotData, trainY = trainReader.getOneHotData()
    # FIX: Index.get_values() was removed in pandas 1.0; columns.tolist()
    # produces the same list of column labels. Hoisted since it is used for
    # both the validation and test encodings.
    train_cols = trainOneHotData.columns.tolist()
    validationOneHotData, valY = validationReader.getOneHotData(train_cols=train_cols)
    testOneHotData, testY = testReader.getOneHotData(train_cols=train_cols)
    timer.checkpointTimeTrack()
    print("trainOneHotData:", trainOneHotData.shape, list(trainOneHotData))
    print("trainY:", trainY.shape, list(trainY))
    print("validationOneHotData:", validationOneHotData.shape, list(validationOneHotData))
    print("valY:", valY.shape, list(valY))
    fmBidModel = FMBidModel.FMBidModel(cBudget=6250 * 1000, modelType='fmclassificationsgd')
    print("==========Training starts")
    # fmBidModel.gridSearchandCrossValidateFastSGD(trainOneHotData, trainY)
    fmBidModel.trainModel(trainOneHotData, trainY, retrain=True, modelFile="data.pruned/fmclassificationsgd.pkl")
    timer.checkpointTimeTrack()
    print("==========Validation starts")
    predictedProb = fmBidModel.validateModel(validationOneHotData, valY)
    timer.checkpointTimeTrack()
    # Bid optimisation / getBidPrice runs are disabled; previously-recorded
    # tuning results (blended scores, thresholds, budgets) live in VCS
    # history. Best params found: 25000 budget -> noBidThreshold=0.2833333,
    # minBid=200, bidRange=100; 6250 budget -> noBidThreshold=0.8666667,
    # minBid=200, bidRange=250; sigmoidDegree=-10 in both cases.
    # fmBidModel.optimiseBid(validationOneHotData, valY)
    # bidIdPriceDF = fmBidModel.getBidPrice(validationOneHotData, valY, noBidThreshold=0.8666667, minBid=200, bidRange=250, sigmoidDegree=-10)
    # bidIdPriceDF.to_csv("mybids.csv")
    return predictedProb
if __name__ == "__main__":
    # Script entry point: load the pruned train / validation / test CSVs,
    # one-hot encode them (validation/test aligned to the training columns),
    # and print shapes for a sanity check.
    trainset = "data.final/train1_cleaned_prune.csv"
    validationset = "data.final/validation_cleaned.csv"
    testset = "data.final/test.csv"
    print("Reading dataset...")
    timer = Utility()
    timer.startTimeTrack()
    trainReader = ipinyouReader.ipinyouReader(trainset)
    validationReader = ipinyouReader.ipinyouReader(validationset)
    testReader = ipinyouReader.ipinyouReader(testset)
    timer.checkpointTimeTrack()
    print("Getting encoded datasets")
    trainOneHotData, trainY = trainReader.getOneHotData()
    # FIX: Index.get_values() was removed in pandas 1.0; columns.tolist()
    # yields the same list of training column labels.
    validationOneHotData, valY = validationReader.getOneHotData(
        train_cols=trainOneHotData.columns.tolist())
    testOneHotData, testY = testReader.getOneHotData(
        train_cols=trainOneHotData.columns.tolist())
    timer.checkpointTimeTrack()
    print("trainOneHotData:", trainOneHotData.shape, list(trainOneHotData))
    print("trainY:", trainY.shape, list(trainY))
    print("validationOneHotData:", validationOneHotData.shape, list(validationOneHotData))
    print("valY:", valY.shape, list(valY))