# Shared imports for the scraper snippets below. `urll_proxy` is the author's
# own proxy-fetch helper module; it is imported, not defined, here.
import os
import re
import time
import shutil
import logging
import threading

import urll_proxy
from bs4 import BeautifulSoup


# Build a {menu name: [category links]} map from the Flipkart home page menu.
def main():
    link = "http://www.flipkart.com/"
    page = urll_proxy.main(link)
    html = page.read()
    soup = BeautifulSoup(html)
    page.close()
    tag_menu = soup.find_all("li", attrs={"class": "menu-l0"})
    dict_menu_links = {}
    for l in tag_menu:
        menu = l.get("data-key")
        tag_menu_item = l.find_all("li", attrs={"class": "menu-item"})
        dict_menu_links[menu] = []
        for l2 in tag_menu_item:
            try:
                dict_menu_links[menu].append("http://www.flipkart.com" + l2.a.get("href"))
            except AttributeError:  # menu item without an <a> tag
                pass
    return dict_menu_links
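# urll_proxy is referenced throughout but its source is not included here. A
# minimal sketch of what its main() likely does, assuming it wraps urllib2
# with an optional HTTP proxy and returns the response object (everything in
# this sketch -- the name, the proxy parameter, the user agent -- is an
# assumption, not part of the original scripts):
def _urll_proxy_main_sketch(link, proxy=None):  # hypothetical stand-in for urll_proxy.main
    import urllib2
    handlers = [urllib2.ProxyHandler({"http": proxy})] if proxy else []
    opener = urllib2.build_opener(*handlers)
    opener.addheaders = [("User-Agent", "Mozilla/5.0")]  # present a browser-like UA
    return opener.open(link)  # file-like: supports .read(), .geturl(), .close()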
# Same menu walk as above, but the result is written to code1_dict_menu_link.txt
# instead of being returned.
def main():
    link = "http://www.flipkart.com/"
    page = urll_proxy.main(link)
    soup = BeautifulSoup(page.read())
    page.close()
    tag_menu = soup.find_all("li", attrs={"class": "menu-l0"})
    dict_menu_link = {}
    for l in tag_menu:
        menu = str(l.get("data-key")).strip()
        tag_menu_item = l.find_all("li", attrs={"class": "menu-item"})
        dict_menu_link[menu] = []
        for l2 in tag_menu_item:
            try:
                full_link = "http://www.flipkart.com" + l2.a.get("href")
                dict_menu_link[menu].append(full_link)
            except AttributeError:  # menu item without an <a> tag
                pass
    f = open("code1_dict_menu_link.txt", "w+")
    print >>f, dict_menu_link
    f.close()
# Classify a sub-link: already a brand page (its slug contains "~brand"), or a
# listing page whose brand facet still needs extracting.
def main4(todatdir_menu, sub_link):
    page = urll_proxy.main(str(sub_link).strip())
    html = page.read()
    soup = BeautifulSoup(str(html))
    print page.geturl()
    page.close()
    link_split = sub_link.strip().split("/")
    if re.search(r'[\w]*~brand', link_split[-2]):
        filename = todatdir_menu + "/sublink_alreadybrand.txt"
        f = open(filename, "a+")
        print >>f, sub_link
        f.close()
    elif soup.find_all("ul", attrs={"id": "brand"}):
        filename = todatdir_menu + "/sublink_brand_to_extract.txt"
        f = open(filename, "a+")
        print >>f, sub_link
        f.close()
    else:
        print "pass from main4"
def main(): link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?sid=reh%2Cihu%2Cm08" page = urll_proxy.main(link) soup = BeautifulSoup(page) tag_ul = soup.find_all("ul", attrs={"id": "brand"}) f = open("page1_bn_bl_bc.csv", "a+") f2 = open("page1_brandname", "a+") tag_a = tag_ul[0].find_all("a") pos = 1 for l in tag_a: brand_link = "http://www.flipkart.com" + str(l.get("href")).strip() brand_name = str(l.span.get_text()).strip() brand_count = str(l.find("span", attrs={ "class": "count" }).get_text()).strip("()") date = str(time.strftime("%d/%m/%Y")) print >> f, ','.join( [date, str(pos), brand_name, brand_count, brand_link]) print >> f2, brand_name pos = pos + 1 f.close() f2.close() shutil.copy2("page1_bn_bl_bc.csv", "page1_bn_bl_bc_new.csv") os.remove("page1_bn_bl_bc.csv")
# Queue worker: for each (directory, link) pair, append the page's brand facet
# to a per-category CSV.
def main3(i, q):
    while True:
        dirtwo, l = q.get()
        dirtwo = dirtwo + "/" + l.split("/")[3]
        if not os.path.exists(dirtwo):
            os.makedirs(dirtwo)
        filename = dirtwo + "/" + "-xx-".join(l.split("/")[3:-1]) + "-xx-bnbcbl.csv"
        f = open(filename, "a+")
        page = urll_proxy.main(l)
        soup = BeautifulSoup(page)
        page.close()
        tag_ul = soup.find("ul", attrs={"id": "brand"})
        tag_a = tag_ul.find_all("a")
        pos = 1
        for a in tag_a:
            if a.get("href"):
                brand_link = "http://www.flipkart.com" + str(a.get("href")).strip()
                brand_name = str(a.span.get_text()).strip()
                brand_count = str(a.find("span", attrs={"class": "count"}).get_text()).strip("()")
                date = str(time.strftime("%d:%m:%Y"))
                print >>f, ','.join([date, str(pos), brand_name, brand_count, brand_link])
                pos = pos + 1
        f.close()
# Queue worker: classify each link, then spawn one main4 thread per sub-category
# found in the page's navigation section.
def main3(i, q):
    while True:
        directory, link = q.get()
        page = urll_proxy.main(link)
        html = page.read()
        soup = BeautifulSoup(html)
        page.close()
        if soup.find("ul", attrs={"id": "brand"}):
            filename = directory + "/extract_brand_from_it.txt"
            f = open(filename, "a+")
            print >>f, link
            f.close()
        link_split = link.split("/")[-2]
        if re.search(".*~brand", link_split):
            filename = directory + "/its_already_brand_.txt"
            f = open(filename, "a+")
            print >>f, link
            f.close()
        tag_nav = soup.find("div", attrs={"class": "nav-section-cat-list"})
        if tag_nav:
            for l in tag_nav:
                try:
                    sub_link = "http://www.flipkart.com" + l.get("href")
                    t = threading.Thread(target=main4, args=(directory, sub_link))
                    t.start()
                except AttributeError:  # skip children that are not tags with an href
                    pass
# Queue worker: scrape the brand facet of each queued listing page into a
# per-category CSV, and record the category directory in "availtoscroll".
def main3(i, q):
    while True:
        fpth, l = q.get()
        fpath_list = fpth.split("/")
        l_list = l.split("/")
        page = urll_proxy.main(l)
        html = page.read()
        soup = BeautifulSoup(html)
        page.close()
        tag_ul = soup.find("ul", attrs={"id": "brand"})
        try:
            tag_a = tag_ul.find_all("a")
            for l2 in tag_a:
                try:
                    if l2.get("href"):
                        brdl = "http://www.flipkart.com" + l2.get("href")
                        brdn = l2.span.get_text()
                        brdc = l2.find("span", attrs={"class": "count"}).get_text()
                        fdir = '/'.join(fpath_list[:-1]) + "/" + l_list[3]
                        if not os.path.exists(fdir):
                            os.makedirs(fdir)
                        # "8" serves as the field separator in the output file name
                        fhomepage = fdir + "/" + "8".join(l_list[3:-1]) + "8.csv"
                        f = open(fhomepage, "a+")
                        date = time.strftime("%H:%M:%S")
                        print >>f, ','.join([date, brdn, brdc, brdl])
                        f.close()
                        f2 = open(fpath_list[0] + "/availtoscroll", "a+")
                        print >>f2, fdir
                        f2.close()
                except AttributeError:  # anchor without the expected spans
                    pass
        except AttributeError:  # page has no brand facet at all
            pass
# Queue worker: sort each menu link into "already a brand page", "has a brand
# facet to extract", or "has sub-categories to walk via main3".
def main2(i, q):
    while True:
        todaydir, menu, l = q.get()
        page = urll_proxy.main(str(l).strip())
        html = page.read()
        soup = BeautifulSoup(str(html))
        print page.geturl()
        page.close()
        link_split = l.strip().split("/")
        todatdir_menu = todaydir + "/" + menu
        if not os.path.exists(todatdir_menu):
            os.makedirs(todatdir_menu)
        if re.search(r'[\w]*~brand', link_split[-2]):
            filename = todatdir_menu + "/alreadybrand.txt"
            f = open(filename, "a+")
            print >>f, l
            f.close()
        elif soup.find_all("ul", attrs={"id": "brand"}):
            filename = todatdir_menu + "/brand_to_extract.txt"
            f = open(filename, "a+")
            print >>f, l
            f.close()
        elif soup.find_all("div", attrs={"class": "nav-section-cat-list"}):
            tag_nav = soup.find_all("div", attrs={"class": "nav-section-cat-list"})
            main3(todatdir_menu, tag_nav[0])
        else:
            print "pass"
        time.sleep(2)  # throttle requests between pages
        q.task_done()
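# main2 drains a Queue and calls q.task_done(), which implies a driver roughly
# like this one (a sketch: the worker count, queue layout, and every name here
# are assumptions, not part of the original scripts):
def _run_main2_workers_sketch(todaydir, dict_menu_link, num_workers=5):
    import Queue
    q = Queue.Queue()
    for i in range(num_workers):
        t = threading.Thread(target=main2, args=(i, q))
        t.daemon = True  # workers loop forever; let the process exit after q.join()
        t.start()
    for menu, links in dict_menu_link.items():
        for link in links:
            q.put((todaydir, menu, link))  # matches the 3-tuple main2 unpacks
    q.join()  # block until every queued link has been processed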
# Classify a sub-link the same way main3 classifies top-level links.
def main4(directory, sub_link):
    page = urll_proxy.main(sub_link)
    html = page.read()
    soup = BeautifulSoup(html)
    page.close()
    link_split = sub_link.split("/")[-2]
    if re.search(".*~brand", link_split):
        filename = directory + "/sub_link_its_already_brand_.txt"
        f = open(filename, "a+")
        print >>f, sub_link
        f.close()
    if soup.find("ul", attrs={"id": "brand"}):
        filename = directory + "/sublink_extract_brand_from_it.txt"
        f = open(filename, "a+")
        print >>f, sub_link
        f.close()
# Extract the brand facet of a given category link into a dated directory
# under brand_info_by_date/.
def main(link):
    page = urll_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_ul = soup.find_all("ul", attrs={"id": "brand"})
    cat_name = link.split("/")[-2].strip()
    currentdir = os.getcwd()
    currentdate = time.strftime("%d%m%Y")
    branddir = currentdir + "/brand_info_by_date/" + cat_name + currentdate
    if not os.path.exists(branddir):
        os.makedirs(branddir)  # mkdir -p equivalent
    fname = branddir + "/" + cat_name
    f = open(fname + "_bn_bc_bl.csv", "a+")
    f2 = open(fname + "_brandname_brandlink.csv", "a+")
    tag_a = tag_ul[0].find_all("a")
    pos = 1
    for l in tag_a:
        brand_link = "http://www.flipkart.com" + str(l.get("href")).strip()
        brand_name = str(l.span.get_text()).strip()
        brand_count = str(l.find("span", attrs={"class": "count"}).get_text()).strip("()")
        date = str(time.strftime("%d/%m/%Y"))
        print >>f, ','.join([date, str(pos), brand_name, brand_count, brand_link])
        print >>f2, ','.join([brand_name, brand_link])
        pos = pos + 1
    f.close()
    f2.close()
    print fname + "_brandname_brandlink.csv"
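# Example invocation of the dated extractor above (a sketch, reusing the
# hand-bags category URL that appears in the earlier page1 snippet):
if __name__ == "__main__":
    main("http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?sid=reh%2Cihu%2Cm08")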
# Entry point for the fashionandyou.com scraper: record today's working
# directory, then collect the vertical menu links and hand them to main2.
def main():
    directory = "dir%s" % (time.strftime("%d%m%Y"))
    try:
        os.makedirs(directory)
    except OSError:
        pass  # directory already exists
    f = open("to_extract.txt", "w+")
    print >>f, directory
    f.close()
    f = open("extracted.txt", "a+")
    print >>f, directory
    f.close()
    link = "http://www.fashionandyou.com/"
    #page = req_proxy.main(link)
    page = urll_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_menu = soup.find("ul", attrs={"id": "verticalsMenu"})
    tag_menu_li = tag_menu.find_all("a", attrs={"class": "vertical-tab"})
    menucontainer = []
    for al in tag_menu_li:
        menulink = "%s%s" % ("http://www.fashionandyou.com", str(al.get("href")).strip())
        menutitle = str(al.get_text())
        menucontainer.append([menulink, menutitle])
    main2(menucontainer)
# Fragment: the write-out tail of a product-detail scraper. The code that
# extracts the fields referenced here is not part of this excerpt.
        date = str(time.strftime("%d:%m:%Y")).strip()
        f = open(filename, "a+")
        print >>f, ','.join([date, catname, brandname, item_title, item_price,
                             item_image, item_clour, item_mrp, item_seller,
                             item_link, sku, size2, str(tag_dis), str(tag_spec)])
        f.close()
        logging.debug([date, catname, brandname, item_title, item_price,
                       item_image, item_clour, item_mrp, item_seller,
                       item_link, sku, size2, str(tag_dis), str(tag_spec)])
    except:
        # on any failure, record the offending link for a later retry
        f = open("newerrorfile", "a+")
        print >>f, l
        f.close()

# Fragment: the fetch-and-parse half of the same scraper.
page = urll_proxy.main(l)
assert page
soup = BeautifulSoup(page)
page.close()
tag_h1 = soup.find("h1", attrs={"itemprop": "name"})
item_title = str(tag_h1.get_text()).strip()
try:
    tag_colour = soup.find("div", attrs={"class": "line extra_text bmargin10"})
    item_clour = str(tag_colour.get_text()).strip()
except AttributeError:  # no colour block on this page
    item_clour = " No more colour"
tag_img = soup.find("img", attrs={"id": "visible-image-small"})
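# The fragment above records each item via logging.debug(); for those records
# to show up, the script needs a configuration along these lines (the file
# name and format here are assumptions, the original setup is not part of
# this excerpt):
logging.basicConfig(filename="scrape_debug.log", level=logging.DEBUG,
                    format="%(asctime)s %(levelname)s %(message)s")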