def run(self):
    """Pick the first product of every crawled brand and rank them.

    MPEP stands for "most popular each product": one row per brand, taken
    from the first line of that brand's product file.

    Steps:
        1. Create the recommender folder.
        2. List the ``.txt`` brand files in the product directory.
        3. For each brand file, take its first line, split it into fields
           and append the brand name.
        4. Convert field 1 of every row to ``int`` and sort the rows by it
           in descending order.
        5. Print the rows, write them to ``recommenders/TopProduct.txt``
           and print the total number of product lines read.

    Returns:
        None. When the product directory does not exist only a message is
        printed.
    """
    # --- 1
    file = File()
    file.createFolder(self.__recommenderDirectory)

    # --- 2
    if not file.isExistDirectory(self.__productDirectory):
        print(self.__productDirectory + '/' + 'is Not Exist ')
        return None

    brandsCrawedNames = file.getFileNameInDir(self.__productDirectory, 'txt')

    # Lines are expected to look like a printed Python list
    # (e.g. "['name', 85, 'href']"); this table strips the newline,
    # brackets and both quote characters in a single pass.
    strip_table = str.maketrans('', '', '\n[]\'"')

    MPEP = []
    productCount = 0

    # --- 3
    for brand in brandsCrawedNames:
        # --- 3.1
        brandName = brand.replace('.txt', '')
        products = file.read_file(self.__productDirectory + '/' + brand)
        productCount += len(products)

        # An empty brand file has no first product; skip it instead of
        # failing on products[0].
        if not products:
            continue

        row = products[0].translate(strip_table).split(",")
        row.append(brandName)
        MPEP.append(row)

    # --- 4
    # Field 1 is read from text, so it must become an int before sorting;
    # otherwise the ordering would be lexicographic.
    for row in MPEP:
        row[1] = int(row[1])
    MPEP = self.sortListByIndex(MPEP, 1, True)
    for row in MPEP:
        print(row)

    # --- 5
    file.addListToFile(
        self.__projectName + '/recommenders/' + 'TopProduct' + ".txt", MPEP)
    print("--" * 100)
    print("Number of Product : " + str(productCount))
    return None
class Crawler:
    """Crawl brand listing pages and store each brand's products by rate."""

    def __init__(self):
        # File helper used for all queue / crawled / products file operations.
        self.file = File()

    def run(self, projectName):
        """Crawl every brand listed in ``<projectName>/queue.txt``.

        Steps:
            1. Create the crawled file and read the queue.
            2. For each queued line of the form ``brandName,url``:
               2.1 split the line and drop the trailing newline,
               2.2 get the number of listing pages,
               2.3 build the ``Brand`` object,
               2.4 create the list that collects all products of the brand,
               2.5 for each page: request it, parse it and merge
                   name / rate / href into one row per product,
               2.6 sort the rows by rate in descending order,
               2.7 write them to ``<projectName>/products/<brand>.txt`` and
                   remove the line from the queue,
               2.8 append the line to ``<projectName>/crawled.txt``.

        :param projectName: directory that holds queue.txt and crawled.txt
        """
        # --- 1
        self.file.createCrawledFile(projectName)
        links = self.file.read_file(projectName + '/queue.txt')

        # --- 2
        for link in links:
            # A blank line carries no brand; unpacking it would fail.
            if not link.strip():
                continue

            # --- 2.1
            # maxsplit=1 keeps URLs that themselves contain a comma intact.
            brandName, linkUrl = link.split(',', 1)
            linkUrl = linkUrl.replace('\n', '')

            # --- 2.2
            pageNumber = self.getPageNumber(linkUrl)

            # --- 2.3
            brand = Brand(brandName, linkUrl, pageNumber)

            # --- 2.4
            arrayOfproducts = []

            # --- 2.5
            for page in range(1, int(pageNumber) + 1):
                # --- 2.5.1
                url = brand.getLink() + '?page=' + str(page)
                response = self.request(url)
                beautifyResponse = self.beautifyHTML(response.text)

                # --- 2.5.2
                products_rate, products_names, products_href = \
                    self.getProductsInfo(
                        self.getPageProducts(beautifyResponse))

                # --- 2.5.3
                # Indexed access (not zip) on purpose: a length mismatch
                # between the three lists raises instead of being silently
                # truncated.
                for index, name in enumerate(products_names):
                    arrayOfproducts.append([
                        name,
                        products_rate[index],
                        products_href[index],
                    ])

            # --- 2.6
            print(arrayOfproducts)
            sortedArray = self.sortListByIndex(arrayOfproducts, 1, True)
            self.file.createFolder(projectName + '/products')

            # --- 2.7
            self.file.addListToFile(
                projectName + "/products/" + str(brand.getName()) + ".txt",
                sortedArray)
            # The untouched queue line (newline included) is what gets
            # removed from the queue and recorded as crawled.
            self.file.removeLineFromFile(projectName + "/queue.txt", link)
            print(link)

            # --- 2.8
            self.file.addLineToFile(projectName + "/crawled.txt", link)

    def getPageNumber(self, brandUrl):
        """Return the number of listing pages of a brand.

        The count is the second-to-last whitespace-separated token of the
        ``div.paginationbox`` element on the brand's base page.

        :param brandUrl: base URL of the brand
        :return: the page-count token (a ``str``) when it parses as an
            integer; otherwise the int ``1``. ``1`` is also returned when
            the pagination box is missing or has fewer than two tokens.
        """
        # --- 1
        response = self.request(brandUrl)
        beautifyResponse = self.beautifyHTML(response.text)

        # --- 2
        paginationBox = beautifyResponse.select_one('div.paginationbox')
        if paginationBox is None:
            # No pagination element on the page: treat it as a single page.
            return 1
        tokens = paginationBox.text.split()
        if len(tokens) < 2:
            return 1
        pageNumber = tokens[-2]

        # --- 3
        try:
            int(pageNumber)
        except ValueError:
            return 1
        return pageNumber

    def beautifyHTML(self, response):
        """Parse raw HTML with BeautifulSoup's ``html.parser``.

        :param response: HTML source text
        :return: the parsed ``BeautifulSoup`` document
        """
        # --- 1
        return BeautifulSoup(response, 'html.parser')

    def request(self, url):
        """Send a GET request to ``url`` with the requests library.

        NOTE: no timeout is passed, so the call waits for as long as the
        server keeps the connection open.

        :param url: URL of a web page
        :return: the response object returned by ``requests.get``
        """
        # --- 1
        return requests.get(url)

    def sortListByIndex(self, list, index, reverse=True):
        """Return a new list of rows sorted by the item at ``index``.

        The parameter name ``list`` shadows the builtin; it is kept so that
        existing keyword callers continue to work.

        :param list: sequence of indexable rows
        :param index: position inside each row used as the sort key
        :param reverse: ``True`` for descending order, ``False`` for ascending
        :return: a sorted list; the input is not modified
        """
        return sorted(list, key=operator.itemgetter(index), reverse=reverse)

    def getPageProducts(self, butifyResponse):
        """Select every ``div`` nested inside ``div.phonesgrid``.

        :param butifyResponse: parsed page that provides ``select``
        :return: the result of ``select('div.phonesgrid div')``
        """
        # --- 1
        return butifyResponse.select('div.phonesgrid div')

    def getProductsInfo(self, productsOfPage):
        """Split the product elements into rates, names and hrefs.

        For each product element:
            1.2 every ``div.ratestars`` found contributes one rate, computed
                from the first and third integers in its ``title`` attribute
                as ``round(first * third / 100)``
                (NOTE(review): the meaning of those two numbers is not
                visible here -- confirm against the site markup);
            1.3 / 1.4 every ``div.info h4 a`` link contributes its text as
                the name and ``https://www.mobile.ir`` + ``href`` as the
                link, unless its text equals the previously seen name.

        :param productsOfPage: iterable of product elements
        :return: tuple ``(products_rate, products_names, products_href)``
        """
        products_rate = []
        products_names = []
        products_href = []

        # --- 1
        # Last name seen; carried across products so that consecutive
        # repeats of the same link text are recorded only once.
        previousName = ''
        for product in productsOfPage:
            # --- 1.2
            for star in self.getProductRate(product):
                numbers = re.findall(r'[0-9]+', star['title'])
                rate = (int(numbers[0]) * int(numbers[2])) / 100
                products_rate.append(round(rate))

            # --- 1.3
            # --- 1.4
            for anchor in self.getProductName(product):
                name = anchor.text
                if name != previousName:
                    previousName = name
                    products_names.append(name)
                    products_href.append(
                        'https://www.mobile.ir' + anchor['href'])

        return products_rate, products_names, products_href

    def getProductName(self, product):
        """Select the name links (``div.info h4 a``) of a product element.

        :param product: element that provides ``select``
        :return: the result of ``select('div.info h4 a')``
        """
        return product.select('div.info h4 a')

    def getProductRate(self, product):
        """Select the rating blocks (``div.ratestars``) of a product element.

        :param product: element that provides ``select``
        :return: the result of ``select('div.ratestars')``
        """
        return product.select('div.ratestars')