def test_adapt_url(self):
    """The URL built for the 'mx' marketplace must point at amazon.com.mx
    with the query terms joined by '+'."""
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    built_url = Amazon.adapt_url(Amazon, query, marketplace)
    expected = 'https://www.amazon.com.mx/s?k=audifonos+inalambricos'
    self.assertEqual(built_url, expected)
def test_there_is_soup(self):
    """Fetching the built URL should yield a parsed document (not None)."""
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    search_url = Amazon.adapt_url(Amazon, query, marketplace)
    page_soup = extract_soup(search_url, 1, just_soup=True)
    self.assertIsNotNone(page_soup)
def test_conection_status(self):
    """The search page must respond with HTTP 200.

    NOTE(review): 'conection' is a typo for 'connection'; the name is kept
    unchanged so the test id stays stable for the runner.
    """
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    search_url = Amazon.adapt_url(Amazon, query, marketplace)
    status_code = extract_soup(search_url, 0, just_status=True)
    self.assertEqual(status_code, 200)
def test_get_brute_info_including_Nones(self):
    """A results page is expected to expose exactly 60 product boxes.

    NOTE(review): 60 is a hard-coded expectation about Amazon's page layout
    and may be brittle if the results-per-page count changes.
    """
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    search_url = Amazon.adapt_url(Amazon, query, marketplace)
    page_soup = extract_soup(search_url, 1, just_soup=True)
    # New test: split the page into per-product boxes.
    product_boxes = search_boxes(page_soup, Amazon.boxes)
    self.assertEqual(len(product_boxes), 60)
def test_get_brute_info_without_losses(self):
    """get_brute_info must produce one raw entry per product box (no drops)."""
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    search_url = Amazon.adapt_url(Amazon, query, marketplace)
    page_soup = extract_soup(search_url, 1, just_soup=True)
    product_boxes = search_boxes(page_soup, Amazon.boxes)
    # New test: raw star strings should line up one-to-one with the boxes.
    raw_star_strings = get_brute_info(product_boxes, Amazon.stars)
    self.assertEqual(len(product_boxes), len(raw_star_strings))
def test_products_info_getters(self):
    """Every per-field getter must return exactly one value per product box."""
    query = 'audifonos inalambricos'
    marketplace = 'mx'
    search_url = Amazon.adapt_url(Amazon, query, marketplace)
    page_soup = extract_soup(search_url, 1, just_soup=True)
    product_boxes = search_boxes(page_soup, Amazon.boxes)
    expected_count = len(product_boxes)
    field_counts = (
        len(get_names(product_boxes, Amazon.name_and_images)),
        len(get_images(product_boxes, Amazon)),
        len(get_products_urls(product_boxes, Amazon)),
        len(get_price(marketplace, product_boxes, Amazon.price)),
        len(get_reviews(marketplace, product_boxes, Amazon.reviews)),
        len(get_stars(marketplace, product_boxes, Amazon.stars)),
    )
    for count in field_counts:
        self.assertEqual(expected_count, count)
def scraper(user_request, country):
    """Scrape an Amazon search results page and return the cheapest product.

    user_request -- raw search phrase typed by the user
    country      -- marketplace country code (e.g. 'mx')
    Returns the dictionary produced by get_cheapest() for the lowest-priced
    listing on the page.
    """
    # Build the marketplace-specific search URL, then fetch and parse it.
    search_url = Amazon.adapt_url(Amazon, user_request, country)
    page_soup = extract_soup(search_url, 1, just_soup=True)
    # One soup element per product listing on the page.
    product_boxes = search_boxes(page_soup, Amazon.boxes)
    # NOTE(review): the part below could be tidied once all 4 scrapers exist.
    prices = get_price(country, product_boxes, Amazon.price)
    # Locate the cheapest listing, then package it as a dictionary.
    cheapest_idx, cheapest_price = cheapest(prices, position_and_price=True)
    return get_cheapest(cheapest_idx, product_boxes, cheapest_price,
                        country, Amazon)
#Obtain the cheapest from prices and then, you obtain the cheapest product as a dictionary amazon_cheapest_idx, amazon_cheapest_price = cheapest( amazon_prices, position_and_price=True) cheapest_amazon_product_dictionary = get_cheapest(amazon_cheapest_idx, amazon_boxes, amazon_cheapest_price, country, Amazon) return cheapest_amazon_product_dictionary if __name__ == "__main__": user_request = 'audifonos inalambricos' country = 'mx' amazon_url = Amazon.adapt_url(Amazon, user_request, country) #All the HTML of the page amazon_soup = extract_soup(amazon_url, 1, just_soup=True) #HTML divided by products, and stored as elements of an array amazon_boxes = search_boxes(amazon_soup, Amazon.boxes) amazon_products = {} amazon_products['name'] = get_names(amazon_boxes, Amazon.name_and_images) '''Amazon's images source (link)''' amazon_products['image'] = get_images(amazon_boxes, Amazon) amazon_products['url'] = get_products_urls(amazon_boxes, Amazon) '''Just Amazon's products id. Is used as a url generator: amazon's url + domain + "/dp/" + product_id'''
parser.add_argument('--T', type=int, default=3) # train arguments parser.add_argument('--n_iter', type=int, default=100) parser.add_argument('--seed', type=int, default=1234) parser.add_argument('--batch_size', type=int, default=2048) parser.add_argument('--learning_rate', type=float, default=1e-3) parser.add_argument('--l2', type=float, default=0) # model dependent arguments parser.add_argument('--d', type=int, default=50) config = parser.parse_args() from data import Amazon data_set = Amazon.Beauty() # Books, CDs, LastFM train_set, test_set, num_users, num_items, kg_map = data_set.generate_dataset( index_shift=1) maxlen = 0 for inter in train_set: if len(inter) > maxlen: maxlen = len(inter) train = Interactions(train_set, num_users, num_items) train.to_newsequence(config.L, config.T) logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) logger.info(config) train_kerl(train, test_set, config, kg_map)
# NOTE(review): `parser` is created earlier in the file, outside this chunk.
parser.add_argument('--model', type=str, default='xHAM')
parser.add_argument('--d', type=int, default=50)
config = parser.parse_args()

#the code below is used to specify the directories to store the results
#resultsName = 'all_results'
#logName = resultsName+'/'+config.model+'/'+config.setting+'/'+config.data+'/'+config.data+'_'+str(config.d)+'_'+str(config.L)+'_'+str(config.T)+'_'+str(config.P)+'_'+str(config.l2)+'_'+str(config.order)+'_'+config.abla+'.'+config.setting
##logging.basicConfig(filename=logName, level=logging.DEBUG)
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Select the dataset loader named by --data; each branch imports its
# module lazily so only the needed data package is loaded.
if config.data == 'CDs':
    from data import Amazon
    data_set = Amazon.CDs()
elif config.data == 'Books':
    from data import Amazon
    data_set = Amazon.Books()
elif config.data == 'Children':
    from data import GoodReads
    data_set = GoodReads.Children()
elif config.data == 'Comics':
    from data import GoodReads
    data_set = GoodReads.Comics()
elif config.data == 'ML20M':
    from data import MovieLens
    data_set = MovieLens.ML20M()
elif config.data == 'ML1M':
    from data import MovieLens
    data_set = MovieLens.ML1M()
# NOTE(review): no final else visible here — the chain may continue past this
# chunk; if it does not, an unrecognized --data leaves `data_set` undefined.
# NOTE(review): `parser`, `logger`, `datetime`, `np`, `Interactions`, and
# `train_model` are defined earlier in the file, outside this chunk.
parser.add_argument('--L', type=int, default=5)
parser.add_argument('--T', type=int, default=3)
# train arguments
parser.add_argument('--n_iter', type=int, default=1000)
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--batch_size', type=int, default=4096)
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--l2', type=float, default=1e-3)
parser.add_argument('--neg_samples', type=int, default=3)
parser.add_argument('--sets_of_neg_samples', type=int, default=50)
# model dependent arguments
parser.add_argument('--d', type=int, default=50)
config = parser.parse_args()

from data import Amazon
data_set = Amazon.MI()
#data_set = Amazon.CDs()  # MovieLens.ML20M() # Books, CDs, Electronics
# item_id=0 for sequence padding
train_set, val_set, train_val_set, test_set, num_users, num_items = data_set.generate_dataset(
    index_shift=1)

# Train on train+val combined; evaluation uses the held-out test split.
train = Interactions(train_val_set, num_users, num_items)
train.to_sequence(config.L, config.T)

logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
logger.info(config)
train_model(train, test_set, config)
# NOTE(review): `NDCGS` is not defined in this chunk — presumably a
# module-level global filled during training; verify before relying on it.
np.save('NDCG_10_B.npy', NDCGS)
# data arguments parser.add_argument('--L', type=int, default=5) parser.add_argument('--T', type=int, default=3) # train arguments parser.add_argument('--n_iter', type=int, default=200) parser.add_argument('--seed', type=int, default=1234) parser.add_argument('--batch_size', type=int, default=4096) parser.add_argument('--learning_rate', type=float, default=1e-3) parser.add_argument('--l2', type=float, default=1e-3) parser.add_argument('--neg_samples', type=int, default=3) parser.add_argument('--sets_of_neg_samples', type=int, default=50) # model dependent arguments parser.add_argument('--d', type=int, default=50) config = parser.parse_args() from data import Amazon, MovieLens data_set = Amazon.CDs() # MovieLens.ML20M() # Books, CDs, Electronics # item_id=0 for sequence padding train_set, val_set, train_val_set, test_set, num_users, num_items = data_set.generate_dataset( index_shift=1) train = Interactions(train_val_set, num_users, num_items) train.to_sequence(config.L, config.T) logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) logger.info(config) train_model(train, test_set, config)
# data arguments parser.add_argument('--L', type=int, default=5) parser.add_argument('--T', type=int, default=3) # train arguments parser.add_argument('--n_iter', type=int, default=200) parser.add_argument('--seed', type=int, default=1234) parser.add_argument('--batch_size', type=int, default=4096) parser.add_argument('--learning_rate', type=float, default=1e-3) parser.add_argument('--l2', type=float, default=1e-3) parser.add_argument('--neg_samples', type=int, default=3) parser.add_argument('--sets_of_neg_samples', type=int, default=50) # model dependent arguments parser.add_argument('--d', type=int, default=50) config = parser.parse_args() from data import Amazon data_set = Amazon.Books() #data_set = Amazon.CDs() # MovieLens.ML20M() # Books, CDs, Electronics # item_id=0 for sequence padding train_set, val_set, train_val_set, test_set, num_users, num_items = data_set.generate_dataset( index_shift=1) train = Interactions(train_val_set, num_users, num_items) train.to_sequence(config.L, config.T) logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) logger.info(config) train_model(train, test_set, config)