def main(line):
    line = ast.literal_eval(line)
    page = req_proxy.main(line[8])
    f = open("myfile.txt", "w+")
    print >>f, page
    f.close()
    #driver = phan_proxy.main(line[8])
    #page = driver.page_source
    #driver.delete_all_cookies()
    #driver.quit()
    #tree = html.fromstring(page)
    #meta_disc = tree.xpath("/html/head/meta[3]/text()")
    soup = BeautifulSoup(page, "html.parser")
    meta_disc = soup.find("meta", attrs={"name": "description"}).get("content")
    title = soup.find("title").get_text()
    desc = soup.find("section", attrs={"id": "product-detail"})
    dte = time.strftime("%d:%m:%Y")
    status = " "
    spec = " "
    vender = "zovi.com"
    brand = "zovi"
    print map(my_strip, [line[7], line[11], line[0], line[5], line[2], line[3],
                         brand, line[9], line[5], line[4], line[1], line[8],
                         vender, title, meta_disc, line[10], desc, spec, dte, status])
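# Note: my_strip (and the mystrip variants used further down) are helpers that are not
# defined anywhere in this section. A minimal sketch of what they appear to do -- coerce
# a scraped value to a string, collapse whitespace, and strip it -- might look like this
# (hypothetical implementation, for illustration only):
def my_strip(value):
    """Hypothetical helper: flatten a scraped value to a single clean string."""
    if value is None:
        return ""
    text = value if isinstance(value, basestring) else str(value)
    return " ".join(text.split())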
def main2(line, all_brand_link, f3):
    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]
    sctlink = line[3]
    scttitle = line[4]
    page = req_proxy.main(all_brand_link)
    soup = BeautifulSoup(page)
    tag_brand_uls = soup.find_all("ul", attrs={"class": "column"})
    for ul_brand_link in tag_brand_uls:
        all_brand_link = ul_brand_link.find_all("a")
        for bl_bt in all_brand_link:
            bl = "%s%s" % ("http://www.amazon.in", bl_bt.get("href"))
            bt = str(bl_bt.span.get_text()).strip()
            filedata = [menu, ctlink, cttitle, sctlink, scttitle, bl, bt]
            filedata2 = map(mystrip, filedata)
            logging.debug(filedata2)
            f4 = open(f3, "a+")
            print >>f4, filedata2
            f4.close()
def get_product(self, line):
    """Given a ct, sct, scl, subsubcate, subcatelink, br and brl, find the products."""
    line = ast.literal_eval(line)
    link = "http://www.junglee.com%s" % (line[-1])
    line[-1] = link
    page = req_proxy.main(link)
    soup = BeautifulSoup(page, "html.parser")
    pro_div_list = soup.find_all("div", attrs={"id": re.compile(r"^result_[\d]{1,}$")})
    # write one record per product div instead of building a throwaway list
    for pro_div in pro_div_list:
        pro_link = pro_div.find("img", attrs={"alt": "Product Details"}).find_parent("a").get("href")
        self.pr_pl_filobj.write(str(map(self.my_strip, line + [pro_link])) + "\n")
def main():
    dte = "dir%s" % (time.strftime("%d%m%Y"))
    try:
        os.makedirs(dte)
    except:
        pass
    link = "http://www.myntra.com/"
    page = req_proxy.main(link)
    #page = phan_proxy.main(link)
    if not page:
        main()
    soup = BeautifulSoup(page)
    tag_mklevel2 = soup.find_all("a", attrs={"class": "mk-level2 "})
    #print len(filter(None, map(menucollection, tag_mklevel2)))
    f = open("to_extract.txt", "w+")
    f2 = open("extracted.txt", "a+")
    print >>f, dte
    print >>f2, dte
    f.close()
    f2.close()
    return filter(None, map(menucollection, tag_mklevel2))
def main2(line, catlink, filename2):
    menu = line[0].strip()
    submnlink = line[1].strip()
    submntitle = line[2].strip()
    catlink = line[-2].strip()
    cattitle = line[-1].strip()
    page = req_proxy.main(catlink)
    soup = BeautifulSoup(page)
    tag_brand = soup.find("ul", attrs={"id": "brand"})
    tag_brand_a = []
    if tag_brand is not None:
        tag_brand_a = tag_brand.find_all("a")
    f = open(filename2, "a+")
    for al in tag_brand_a:
        brandlink = "%s%s" % ("http://www.flipkart.com", str(al.get("href")).strip())
        brandtitle = str(al.get_text()).replace("\n", " ").replace("\t", " ").replace("\r", " ").strip()
        print >>f, [menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle]
        logging.debug([menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle])
    f.close()
def main():
    directory = "diramazon%s" % (time.strftime("%d%m%y"))
    try:
        os.makedirs(directory)
    except:
        pass
    f = open("to_extract_amazon.txt", "w+")
    f.write(directory)
    f.close()
    f = open("extracted_amazon.txt", "a+")
    f.write(directory)
    f.close()
    filename = "%s/%s" % (directory, "f_mn_ctl_cat.txt")
    f = open(filename, "a+")
    link = "http://www.amazon.in/gp/site-directory/ref=sa_menu_top_fullstore"
    page = req_proxy.main(link)
    soup2 = BeautifulSoup(page)
    catlist = ["Beauty & Health", "Watches & Fashion Jewellery", "Handbags & Luggage"]
    map(functools.partial(main2, soup=soup2, f=f), catlist)
    f.close()
def main():
    link = "http://www.homeshop18.com/all-stores.html"
    page = req_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_border = soup.find_all("div", attrs={"class": "border-box2 mar-top"})
    patter_firstpage = []
    for brend in tag_border:
        brend_title = brend.find("div", attrs={"class": "brend-title clearfix"}).get_text()
        cat = str(brend_title).strip()
        if cat in ("Books", "Jewellery"):
            continue
        tag_img_holder = brend.find_all("div", attrs={"class": "img-holder"})
        for sub_cat in tag_img_holder:
            sub_cat_link = str(sub_cat.find("a").get("href")).strip()
            parsed = urlparse(sub_cat_link)
            sub_cat_title = filter(None, str(parsed.path).split("/"))
            patter_firstpage.append([cat, sub_cat_link, sub_cat_title[0].strip()])
    return patter_firstpage
def main2(mn_sbml_sbml, filename):
    menu = mn_sbml_sbml[0]
    submenulink = mn_sbml_sbml[1]
    submenutitle = mn_sbml_sbml[2]
    page = req_proxy.main(submenulink)
    soup = BeautifulSoup(page)
    tag_cat = soup.find("div", attrs={"class": "nav-section first-section"})
    tag_cat_link = []
    if tag_cat is not None:
        tag_cat_link = tag_cat.find_all("a")
    f = open(filename, "a+")
    for l in tag_cat_link:
        cattitle = str(l.get("title")).strip()
        catlink = "%s%s" % ("http://www.flipkart.com", str(l.get("href")).strip())
        print >>f, [menu, submenulink, submenutitle, catlink, cattitle]
        logging.debug([menu, submenulink, submenutitle, catlink, cattitle])
    f.close()
def part_second(line, filename2):
    f = open(filename2, "a+")
    line2 = line.strip().split(",")
    menulink = line2[0].strip()
    menutitle = line2[1].strip()
    catlink = line2[2].strip()
    cattitle = line2[3].replace("\n", "").strip()
    subcatlink = line2[-2].strip()
    subcattitle = line2[-1].strip()
    page = req_proxy.main(subcatlink)
    soup = BeautifulSoup(page, "html.parser")
    tag_brand = soup.find("div", text=re.compile("Brand"))
    blbtbc_list = tag_brand.parent.parent.find_all("li", attrs={"class": "srch_ctgry_item"})
    for blbtbc in blbtbc_list:
        brandlink = "%s%s" % ("http://www.homeshop18.com", str(blbtbc.a.get("href")).strip())
        brandtitle = str(blbtbc.a.get("title")).replace("\n", "").strip()
        print >>f, ','.join([menulink, menutitle, catlink, cattitle, subcatlink, subcattitle, brandlink, brandtitle])
        print menulink, menutitle, catlink, cattitle, subcatlink, subcattitle, brandlink, brandtitle
        print "*" * 145
    f.close()
def main2(line, all_brand_link, f3):
    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]
    sctlink = line[3]
    scttitle = line[4]
    page = req_proxy.main(all_brand_link)
    soup = BeautifulSoup(page)
    tag_brand_uls = soup.find_all("ul", attrs={"class": "column"})
    for ul_brand_link in tag_brand_uls:
        all_brand_link = ul_brand_link.find_all("a")
        for bl_bt in all_brand_link:
            bl = "%s%s" % ("http://www.amazon.in", bl_bt.get("href"))
            bt = str(bl_bt.span.get_text()).strip()
            filedata = [menu, ctlink, cttitle, sctlink, scttitle, bl, bt]
            filedata2 = map(mystrip, filedata)
            logging.debug(filedata2)
            #print >>f3, filedata2
            f4 = open(f3, "a+")
            print >>f4, filedata2
            f4.close()
def lnk_tl_vl_vl_prl_fun2(self, line, counter, cl=None):
    if cl is None:
        cl = line[-2]
    page = req_proxy.main(cl)
    soup = BeautifulSoup(page)
    pro_div = soup.find("ul", attrs={"id": "listing-product-list"})
    pro_link = pro_div.find_all("li", attrs={"class": "listing-unit"})
    for pl in pro_link:
        pl = pl.find("a", attrs={"class": "OneLinkNoTx"})
        pl = pl.get("href")
        print line + [pl]
        self.f2.write(str(line + [pl]) + "\n")
    counter += len(pro_link)
    if counter > 1000:
        return 0
    next_url = pro_div.find("li", attrs={"class": "next"})
    try:
        next_url = "http://download.cnet.com%s" % (next_url.a.get("href"))
        logging.debug(next_url)
        self.lnk_tl_vl_vl_prl_fun2(line, counter, next_url)
    except:
        pass
def main2(ml_mt_sub, filename):
    f = open(filename, "a+")
    menulink = ml_mt_sub[0]
    menutitle = ml_mt_sub[1]
    page = req_proxy.main(menulink)
    soup = BeautifulSoup(page, "html.parser")
    tag_box = soup.find_all("div", attrs={"class": "head clearfix"})
    for al in tag_box:
        cato = al.find("div")
        catolink = "%s%s" % ("http://www.homeshop18.com", str(cato.a.get("href")).strip())
        catotitle = cato.a.get_text()
        sub_cato = al.find_next_sibling("div")
        if sub_cato:
            sub_cato2 = sub_cato.find_all("a")
            for al in sub_cato2:
                sub_catolink = "%s%s" % ("http://www.homeshop18.com", str(al.get("href")).strip())
                sub_catotext = al.get("title")
                print >>f, ','.join([menulink, menutitle, catolink, catotitle, sub_catolink, sub_catotext])
        else:
            print >>f, ','.join([menulink, menutitle, catolink, catotitle, catolink, catotitle])
    f.close()
def main5(i, q):
    for line, f in iter(q.get, None):
        link = line[-2].strip()
        page = req_proxy.main(link)
        soup = BeautifulSoup(page)
        tag_brand = soup.find("div", attrs={"id": "facet_brand"})
        try:
            tag_a = tag_brand.find_all("a")
        except:
            tag_a = []
        for l in tag_a:
            try:
                brandlink = str(l.get("href")).strip()
                brandname = str(l.get_text()).strip()
                print >>f, "%s,%s,%s" % (','.join(line), brandlink, brandname)
            except:
                pass
        time.sleep(i + 2)
        q.task_done()
    # acknowledge the None sentinel that terminated the loop
    q.task_done()
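# main5 above consumes (line, f) work items until it pulls a None sentinel off the
# queue. The producer side is not shown in this section; a minimal sketch of how such
# a queue might be filled and shut down (function name, worker count, and arguments are
# assumptions for illustration) could look like this:
def run_workers(work_items, out_file, num_workers=4):
    q = Queue()
    for i in range(num_workers):
        t = threading.Thread(target=main5, args=(i, q))
        t.setDaemon(True)
        t.start()
    for line in work_items:
        q.put((line, out_file))   # one tuple per category line to scrape
    for _ in range(num_workers):
        q.put(None)               # one sentinel per worker so each loop exits
    q.join()                      # wait until every item and sentinel is acknowledged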
def main3(i, q):
    for link in iter(q.get, None):
        page = req_proxy.main(link)
        soup = BeautifulSoup(page)
        tag_cat = soup.find_all("div", attrs={"class": "search-by-cat mt10 mb10 pl14 "})
        if tag_cat:
            cat_tag_a = tag_cat[0].find_all("a")
        else:
            cat_tag_a = []
        for cl in cat_tag_a:
            try:
                link_cl_ctxt_ccount.append([link, str(cl.get("href")).strip(), str(cl.get_text()).strip()])
                logging.debug((link, str(cl.get("href")).strip(), str(cl.get_text()).strip()))
            except:
                pass
        time.sleep(i + 2)
        q.task_done()
    # acknowledge the None sentinel that terminated the loop
    q.task_done()
def part_second(line, filename2):
    f = open(filename2, "a+")
    line2 = line.strip().split(",")
    menulink = line2[0].strip()
    menutitle = line2[1].strip()
    catlink = line2[2].strip()
    cattitle = line2[3].replace("\n", "").strip()
    subcatlink = line2[-2].strip()
    subcattitle = line2[-1].strip()
    page = req_proxy.main(subcatlink)
    soup = BeautifulSoup(page)
    tag_brand = soup.find("div", text=re.compile("Brand"))
    blbtbc_list = tag_brand.parent.parent.find_all("li", attrs={"class": "srch_ctgry_item"})
    for blbtbc in blbtbc_list:
        brandlink = "%s%s" % ("http://www.homeshop18.com", str(blbtbc.a.get("href")).strip())
        brandtitle = str(blbtbc.a.get("title")).replace("\n", "").strip()
        print >>f, ','.join([menulink, menutitle, catlink, cattitle, subcatlink, subcattitle, brandlink, brandtitle])
        print menulink, menutitle, catlink, cattitle, subcatlink, subcattitle, brandlink, brandtitle
        print "*" * 145
    f.close()
def main(link):
    page = req_proxy.main(link)
    soup = BeautifulSoup(page, "html.parser")
    all_page_list = soup.find_all("li", attrs={"class": "MyAccountPag"})
    threads = []
    t = threading.Thread(target=main2, args=(link,))
    threads.append(t)
    t.start()
    for page_link_tag in all_page_list:
        page_link = "http://www.yebhi.com%s" % (str(page_link_tag.a.get("href")))
        t = threading.Thread(target=main2, args=(page_link,))
        threads.append(t)
        t.start()
    main_thread = threading.currentThread()
    for t in threading.enumerate():
        if t is main_thread:
            continue
        logging.debug('joining %s', t.getName())
        t.join()
def main(directory, mainlink):
    filename = "%s/complete_link_collection.txt" % (directory)
    f = open(filename, "a+")
    page = req_proxy.main(mainlink)
    soup = BeautifulSoup(page, "html.parser")
    cat_collection_box = soup.find("div", attrs={"class": "brw_bdr"})
    link_list = cat_collection_box.find_all("a")
    for link in link_list:
        link = link.get("href")
        parsed = urlparse(link)
        if len(parsed.netloc) == 0:
            link = "http://www.snapdeal.com%s" % (link)
        f.write(str(link) + "\n")
        #print link
    f.close()
    return filename
def main(line, directory):
    filename = "%s/f_cl_tr_ct_st_scl_bt_bl.txt" % (directory)
    f = open(filename, "a+")
    page = req_proxy.main(line[-1])
    soup = BeautifulSoup(page, "html.parser")
    brand_tag_list = soup.find_all("span", attrs={"class": "forspan"})
    for brand_tag in brand_tag_list:
        if str(brand_tag.get_text()).strip() == "Brands":
            brand_box = brand_tag.find_parent("div", attrs={"class": "divli"})
            brand_list = brand_box.find_all("a", attrs={"class": "RefineByLink"})
            for brand_tag in brand_list:
                brand = str(brand_tag.get("relmselect"))
                brand_link = "http://www.yebhi.com%s" % (str(brand_tag.get("href")))
                #f.write(str([catelink, target, cate, sub_cat, sub_cat_link]) + "\n")
                f.write(str([line[0], line[1], line[2], line[3], line[4], brand, brand_link]) + "\n")
                logging.debug([line[0], line[1], line[2], line[3], line[4], brand, brand_link])
    f.close()
def main(line, directory):
    direc = "%s/%s/%s/%s" % (directory, line[2], line[3], line[5])
    try:
        os.makedirs(direc)
    except:
        pass
    filename = "%s/%s.csv" % (direc, line[5])
    f = open(filename, "a+")
    page = req_proxy.main(line[-6])
    soup = BeautifulSoup(page, "html.parser")
    title = soup.find("title").get_text()
    meta_disc = soup.find("meta", attrs={"name": "description"}).get("content")
    seller = "yebhi.com"
    item_desc = soup.find("div", attrs={"itemprop": "description"})
    dte = time.strftime("%d:%m:%Y")
    status = " "
    f.write(",".join(map(my_strip, [line[9], line[7], line[0], line[12], line[2], line[3],
                                    line[5], line[10], line[11], '', line[1], line[8],
                                    seller, title, meta_disc, line[13], item_desc, '',
                                    dte, status])) + "\n")
    f.close()
    logging.debug("inserted ............")
def main(directory, link):
    page = req_proxy.main(link)
    #driver = phan_proxy.main(link)
    #try:
    #    driver.find_element_by_xpath("/html/body/div[2]/div/div/div/div/a/img").click()
    #    logging.debug("clicked..................................................................")
    #except:
    #    pass
    #driver = phan_scroller.main(driver)
    #page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    target_cat_list = soup.find("div", attrs={"id": "breadCrumbWrapper"}).find_all("span", attrs={"itemprop": "title"})
    filename = "%s/%s.doc" % (directory, "_".join(str(target_cat_list[-1].get_text()).split()))
    f = open(filename, "a+")
    item_big_box_list = soup.find("div", attrs={"id": re.compile("products-main")})
    item_box_list = item_big_box_list.find_all("div", attrs={"class": "product_grid_row"})
    for item_box in item_box_list:
        item_sub_box_list = item_box.find_all("div", attrs={"class": "product_grid_cont gridLayout3"})
        if len(item_sub_box_list) == 0:
            item_sub_box_list = item_box.find_all("div", attrs={"class": "product_grid_cont gridLayout4"})
        for item_sub_box in item_sub_box_list:
            item_link = item_sub_box.find("a", attrs={"class": "hit-ss-logger somn-track prodLink"}).get("href")
            parsed = urlparse(item_link)
            if len(parsed.netloc) == 0:
                item_link = "http://www.snapdeal.com%s" % (item_link)
            size = []
            size_box_list = item_sub_box.find("div", attrs={"class": "productAttr"})
            if size_box_list is not None:
                size_option_box = size_box_list.find_all("option")
                for size_option in size_option_box:
                    size.append(size_option.get("value"))
            if len(size) != 0:
                size = filter(None, map(my_strip, size))
            info = [link, item_link, size]
            info2 = map(my_strip, info)
            f.write(str(info2) + "\n")
            logging.debug("inserted........................................................................")
    f.close()
def main2(line, filename):
    catlink = line[0]
    cattitle = line[1]
    f = open(filename, "a+")
    page = req_proxy.main(catlink)
    soup = BeautifulSoup(page)
    tag_page = soup.find("div", attrs={"id": "wp_page_numbers"})
    if tag_page:
        tag_page_a = tag_page.find_all("a")[:-1]
        for cat_page in tag_page_a:
            sub_page_link = str(cat_page.get("href")).strip()
            sub_page = req_proxy.main(sub_page_link)
            sub_soup = BeautifulSoup(sub_page)
            # scrape the paginated page that was just fetched, not the first page again
            tag_content_a = sub_soup.find_all("h2", attrs={"class": "title"})
            for subcatlinktitle in tag_content_a:
                subcatlink = str(subcatlinktitle.a.get("href")).strip()
                subcattitle = subcatlinktitle.get_text().encode("ascii", "ignore")
                subcattitle = str(subcattitle).strip().replace("\n", "").replace("\t", "").replace(",", "")
                print >>f, ",".join([catlink, cattitle, subcatlink, subcattitle])
    else:
        tag_content_a = soup.find_all("h2", attrs={"class": "title"})
        for subcatlinktitle in tag_content_a:
            subcatlink = str(subcatlinktitle.a.get("href")).strip()
            subcattitle = subcatlinktitle.get_text().encode("ascii", "ignore")
            subcattitle = str(subcattitle).strip().replace("\n", "").replace("\t", "").replace(",", "")
            print >>f, ",".join([catlink, cattitle, subcatlink, subcattitle])
    f.close()
def pl_to_info_collection(self):
    tl = self.line_list[0]
    tt = self.line_list[1]
    cl = self.line_list[2]
    ct = self.line_list[3]
    pl = self.line_list[4]
    page = req_proxy.main(pl)
    soup = BeautifulSoup(page)
    pt = soup.find("h1", attrs={"itemprop": "name"})
    pt = pt.get_text().encode("ascii", "ignore")
    version = soup.find("li", attrs={"class": "qsVersion"})
    version = version.get_text().encode("ascii", "ignore")
    filesize = soup.find("li", attrs={"class": "fileSize"})
    filesize = filesize.get_text().encode("ascii", "ignore")
    dtadded = soup.find("li", attrs={"class": "qsDateAdded"})
    dtadded = dtadded.get_text().encode("ascii", "ignore")
    price = soup.find("li", attrs={"class": "qsPrice"})
    price = price.get_text().encode("ascii", "ignore")
    oosys = soup.find("li", attrs={"class": "qsOs"})
    oosys = oosys.get_text().encode("ascii", "ignore")
    tdown = soup.find("li", attrs={"class": "qsTotalDownloads"})
    tdown = tdown.get_text().encode("ascii", "ignore")
    wkdown = soup.find("li", attrs={"class": "qsWeeklyDownloads"})
    wkdown = wkdown.get_text().encode("ascii", "ignore")
    direc2 = "%s/%s/%s" % (self.direc, tt, ct)
    try:
        os.makedirs(direc2)
    except:
        pass
    filename = "%s/%s.csv" % (direc2, ct)
    f = open(filename, "a+")
    info = map(self.mystrip, [tl, tt, cl, ct, pl, pt, version, filesize, dtadded,
                              price, oosys, tdown, wkdown])
    logging.debug(info)
    infostr = ','.join(info)
    f.write(infostr + "\n")
    f.close()
def main2(line, filename):
    catlink = line[0]
    cattitle = line[1]
    f = open(filename, "a+")
    page = req_proxy.main(catlink)
    soup = BeautifulSoup(page)
    tag_page = soup.find("div", attrs={"id": "wp_page_numbers"})
    if tag_page:
        tag_page_a = tag_page.find_all("a")[:-1]
        for cat_page in tag_page_a:
            sub_page_link = str(cat_page.get("href")).strip()
            sub_page = req_proxy.main(sub_page_link)
            sub_soup = BeautifulSoup(sub_page)
            # scrape the paginated page that was just fetched, not the first page again
            tag_content_a = sub_soup.find_all("h2", attrs={"class": "title"})
            for subcatlinktitle in tag_content_a:
                subcatlink = str(subcatlinktitle.a.get("href")).strip()
                subcattitle = subcatlinktitle.get_text().encode("ascii", "ignore")
                subcattitle = str(subcattitle).strip().replace("\n", "").replace("\t", "").replace(",", "")
                print >>f, ",".join([catlink, cattitle, subcatlink, subcattitle])
    else:
        tag_content_a = soup.find_all("h2", attrs={"class": "title"})
        for subcatlinktitle in tag_content_a:
            subcatlink = str(subcatlinktitle.a.get("href")).strip()
            subcattitle = subcatlinktitle.get_text().encode("ascii", "ignore")
            subcattitle = str(subcattitle).strip().replace("\n", "").replace("\t", "").replace(",", "")
            print >>f, ",".join([catlink, cattitle, subcatlink, subcattitle])
    f.close()
def page1_cat_link_collect(self):
    page = req_proxy.main(self.link)
    tree = html.fromstring(page)
    cat_box = tree.xpath("/html/body/div/div/div[5]/div[2]/div[2]/div/ul/li[2]/ul/li")
    all_cat_a = []
    for cat_link in cat_box:
        all_cat_a.append(cat_link.xpath("a/@href")[0])
    self.all_cat_a = all_cat_a
def cat_to_subcat(fs, link):
    page = req_proxy.main(link)
    soup = BeautifulSoup(page, "html.parser")
    cat_big_box = soup.find("div", attrs={"id": "matchingCatbox"})
    cat_box_list = cat_big_box.find_all("a", attrs={"class": re.compile("somn-track")})
    for cat_box in cat_box_list:
        cat_link = my_strip(cat_box.get("href"))
        fs.write(cat_link + "\n")
def page_link_to_movie_link(self, link):
    f = self.page_link_to_mov_link
    page = req_proxy.main(link)
    soup = BeautifulSoup(page)
    movi_link_box = soup.find("div", attrs={"id": "content"})
    movi_link_list = movi_link_box.find_all("a", title=re.compile("Permanent Link"))
    for mov_link in movi_link_list:
        f.write(",".join([link, str(mov_link.get("href"))]) + "\n")
        logging.debug([link, str(mov_link.get("href"))])
def pl_to_info_collection(self):
    tl = self.line_list[0]
    tt = self.line_list[1]
    cl = self.line_list[2]
    ct = self.line_list[3]
    pl = self.line_list[4]
    page = req_proxy.main(pl)
    soup = BeautifulSoup(page, "html.parser")
    pt = soup.find("h1", attrs={"itemprop": "name"})
    pt = pt.get_text().encode("ascii", "ignore")
    version = soup.find("li", attrs={"class": "qsVersion"})
    version = version.get_text().encode("ascii", "ignore")
    filesize = soup.find("li", attrs={"class": "fileSize"})
    filesize = filesize.get_text().encode("ascii", "ignore")
    dtadded = soup.find("li", attrs={"class": "qsDateAdded"})
    dtadded = dtadded.get_text().encode("ascii", "ignore")
    price = soup.find("li", attrs={"class": "qsPrice"})
    price = price.get_text().encode("ascii", "ignore")
    oosys = soup.find("li", attrs={"class": "qsOs"})
    oosys = oosys.get_text().encode("ascii", "ignore")
    tdown = soup.find("li", attrs={"class": "qsTotalDownloads"})
    tdown = tdown.get_text().encode("ascii", "ignore")
    wkdown = soup.find("li", attrs={"class": "qsWeeklyDownloads"})
    wkdown = wkdown.get_text().encode("ascii", "ignore")
    direc2 = "%s/%s/%s" % (self.direc, tt, ct)
    try:
        os.makedirs(direc2)
    except:
        pass
    filename = "%s/%s.csv" % (direc2, ct)
    f = open(filename, "a+")
    info = map(self.mystrip, [tl, tt, cl, ct, pl, pt, version, filesize, dtadded,
                              price, oosys, tdown, wkdown])
    logging.debug(info)
    infostr = ",".join(info)
    f.write(infostr + "\n")
    f.close()
def movie_link_tu_page(self, link):
    page = req_proxy.main(link)
    soup = BeautifulSoup(page)
    try:
        page_links_div = soup.find("div", attrs={"id": "wp_page_numbers"})
        page_links_li = page_links_div.find_all("a")
        for a in page_links_li[:-1]:
            self.movie_page_link.append(a.get("href"))
    except:
        self.movie_page_link.append(link)
def main(line, f3):
    #def main(line):
    line = line.strip()
    line = ast.literal_eval(line)
    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]
    if cttitle == "Home & Decor":
        # the site spells this category with the accented character
        cttitle = unicode("Home & Décor", "utf8")
    #ctlink = "http://www.amazon.in/Sling-Bags/b/ref=sd_allcat_hbc_sling?ie=UTF8&node=1983351031"
    #cttitle = "Sling & Cross-Body Bags"
    page = req_proxy.main(ctlink)
    soup = BeautifulSoup(page)
    tag_depatrmen = soup.find("h2", text=re.compile("Department"))
    tag_ul = tag_depatrmen.find_next("ul")
    tag_strong = tag_ul.find("strong", text=re.compile(cttitle))
    #tag_li = tag_strong.find_next("li")
    parent_li = tag_strong.find_parent("li")
    tag_narrow = None
    try:
        parent_li = parent_li.find_next_sibling()
        tag_narrow = parent_li.find("span", attrs={"class": "narrowValue"})
    except:
        print >>f3, [menu, ctlink, cttitle, ctlink, cttitle]
    loop = True
    while loop is True:
        if tag_narrow is not None:
            tag_narrow = tag_narrow.find_next("a")
            sctlink = "%s%s" % ("http://www.amazon.in", tag_narrow.get("href"))
            scttitle = tag_narrow.get_text().encode("ascii", "ignore").strip()
            if scttitle != "What's this?":
                print >>f3, [menu, ctlink, cttitle, sctlink, scttitle]
            tag_narrow = tag_narrow.find("span", attrs={"class": "narrowValue"})
        else:
            loop = False
def cat_to_subcat_brand(fb, link):
    page = req_proxy.main(link)
    soup = BeautifulSoup(page, "html.parser")
    brand_big_box = soup.find("div", attrs={"name": "Brand"})
    brand_box_list = brand_big_box.find_all("input", attrs={"filtername": "Brand"})
    for brandbox in brand_box_list:
        brand = my_strip(brandbox.get("value"))
        #print "http://www.snapdeal.com/products/men-apparel-jeans/?q=Brand%3A" + brand + "&FID=checkbox_searchable_Brand%20%3A" + brand
        brand_link = "%s/?q=Brand:%s" % (link, brand)
        fb.write(brand_link + "\n")
        logging.debug("inserted...................................................................................")
def main(ntl_pth_cat):
    link = ntl_pth_cat[0]
    page = req_proxy.main(link)
    if not page:
        # retry and stop here so main2 never runs with an empty page
        return main(ntl_pth_cat)
    filename = "dir%s/%s" % (time.strftime("%d%m%Y"), "cl_cpth_sc_bl_bn_bc_links_extracted.txt")
    f = open(filename, "a+")
    print >>f, link
    f.close()
    main2(ntl_pth_cat, page)
def main():
    f = open("to_extract_jnglee.txt")
    directory = f.read().strip()
    f.close()
    filename = "%s/ml_mt_ct_cl.txt" % (directory)
    f = open(filename)
    line = f.read().strip()
    f.close()
    line_list = ast.literal_eval(line)
    ml_mt_cl_ct_sl_st = []
    for line in line_list:
        ml = line[0]
        mt = line[1]
        cl = line[2]
        ct = line[3]
        parsed = urlparse(cl)
        link_pth_list = filter(None, parsed.path.split("/"))
        if link_pth_list[1] == 'b':
            page = req_proxy.main(cl)
            soup = BeautifulSoup(page, "html.parser")
            tag_department = soup.find("h3", text=re.compile("Department"))
            tag_ul = tag_department.find_next_sibling("ul")
            if tag_ul is None:
                tag_ul = tag_department.find_next_sibling("div", attrs={"class": "left_nav_section"})
                tag_ul = tag_ul.find("ul")
            tag_a = tag_ul.find_all("a")
            for sl_sl in tag_a:
                sl = "%s%s" % ("http://www.junglee.com", str(sl_sl.get("href")))
                st = str(sl_sl.get_text()).strip()
                ml_mt_cl_ct_sl_st.append([ml, mt, cl, ct, sl, st])
    filename = "%s/ml_mt_cl_ct_sl_st.txt" % (directory)
    f = open(filename, "w+")
    print >>f, ml_mt_cl_ct_sl_st
    f.close()
    return ml_mt_cl_ct_sl_st
def main():
    directory = "dirjnglee%s" % (time.strftime("%d%m%Y"))
    try:
        os.makedirs(directory)
    except:
        pass
    f = open("extracted_jnglee.txt", "a+")
    f.write(directory)
    f.close()
    f = open("to_extract_jnglee.txt", "w+")
    f.write(directory)
    f.close()
    Health_beauti = {}
    Health_beauti["Health-Personal-Care"] = "http://www.junglee.com/Health-Personal-Care/b/683850031/ref=nav_menu_6_1_1_0"
    Health_beauti["Beauty"] = "http://www.junglee.com/Beauty/b/837260031/ref=nav_menu_6_2_1_0"
    Health_beauti["Clothing"] = "http://www.junglee.com/Clothing/b/683843031/ref=nav_menu_2_1_1_0"
    Health_beauti["Shoes"] = "http://www.junglee.com/Shoes/b/805169031/ref=nav_menu_2_2_1_0"
    Health_beauti["Watches"] = "http://www.junglee.com/Watches/b/683890031/ref=nav_menu_2_3_1_0"
    Health_beauti["Accessories-online"] = "http://www.junglee.com/buy/Accessories-online/1000702243/ref=nav_menu_2_4_1_0"
    Health_beauti["Jewellery"] = "http://www.junglee.com/Jewellery/b/683862031/ref=nav_menu_2_5_1_0"
    ml_mt_ct_cl = []
    for mt, ml in Health_beauti.items():
        page = req_proxy.main(ml)
        soup = BeautifulSoup(page, "html.parser")
        cat_div = soup.find("div", attrs={"id": "left-1"})
        catt_catl = cat_div.find_all("a")
        for ct_cl in catt_catl:
            cl = "%s%s" % ("http://www.junglee.com", str(ct_cl.get("href")))
            ct = str(ct_cl.get_text()).strip()
            ml_mt_ct_cl.append([ml, mt, cl, ct])
    filename = "%s/ml_mt_ct_cl.txt" % (directory)
    f = open(filename, "w+")
    print >>f, ml_mt_ct_cl
    f.close()
    return ml_mt_ct_cl
def main3(link):
    page = req_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_first_container = soup.find("div", attrs={"class": "filter-container first"})
    tag_div_cat = soup.find("div", text=re.compile("categories"))
    if tag_div_cat:
        return tag_first_container
def lnk_to_cat_collecion(self, link):
    page = req_proxy.main(link)
    soup = BeautifulSoup(page)
    parsed = urlparse(link)
    link_path = filter(None, parsed.path.split("/"))
    tittl = link_path[1]
    cat_div = soup.find("dl", attrs={"class": "catNav"})
    cl_ct_list = cat_div.find_all("dd")
    for cl_ct in cl_ct_list:
        cl = cl_ct.a.get("href")
        cl = "%s%s" % ("http://download.cnet.com", str(cl))
        ct = str(cl_ct.a.get_text())
        print >>self.f, [link, tittl] + map(self.mystrip, [cl, ct])
        self.lnk_ttl_cl_ct.append([link, tittl] + map(self.mystrip, [cl, ct]))
def main(line, f3):
    line = line.strip()
    line = ast.literal_eval(line)
    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]
    sctlink = line[3]
    scttitle = line[4]
    page = req_proxy.main(sctlink)
    soup = BeautifulSoup(page)
    tag_brands = soup.find("h2", text=re.compile("Brands"))
    tag_ul = tag_brands.find_next("ul")
    tag_span_see_more = tag_ul.find("span", attrs={"class": "refinementLink seeMore"})
    if tag_span_see_more is not None:
        all_brand_link = "%s%s" % ("http://www.amazon.in", tag_span_see_more.find_parent("a").get("href"))
        main2(line, all_brand_link, f3)
    else:
        tag_al = tag_ul.find_all("a")
        for al in tag_al:
            bl = "%s%s" % ("http://www.amazon.in", al.get("href"))
            bt = al.span.get_text().encode("ascii", "ignore")
            filedata = [menu, ctlink, cttitle, sctlink, scttitle, bl, bt]
            filedata2 = map(mystrip, filedata)
            logging.debug(filedata2)
            f4 = open(f3, "a+")
            print >>f4, filedata2
            f4.close()
def main2(ml_mt_sub, filename):
    f = open(filename, "a+")
    menulink = ml_mt_sub[0]
    menutitle = ml_mt_sub[1]
    page = req_proxy.main(menulink)
    soup = BeautifulSoup(page)
    tag_box = soup.find_all("div", attrs={"class": "head clearfix"})
    for al in tag_box:
        cato = al.find("div")
        catolink = "%s%s" % ("http://www.homeshop18.com", str(cato.a.get("href")).strip())
        catotitle = cato.a.get_text()
        sub_cato = al.find_next_sibling("div")
        if sub_cato:
            sub_cato2 = sub_cato.find_all("a")
            for al in sub_cato2:
                sub_catolink = "%s%s" % ("http://www.homeshop18.com", str(al.get("href")).strip())
                sub_catotext = al.get("title")
                print >>f, ','.join([menulink, menutitle, catolink, catotitle, sub_catolink, sub_catotext])
        else:
            print >>f, ','.join([menulink, menutitle, catolink, catotitle, catolink, catotitle])
    f.close()
def main3(line, f2):
    line = line.strip()
    line2 = line.split(",")
    catlink = line2[-2]
    page = req_proxy.main(catlink)
    soup = BeautifulSoup(page)
    subcat = soup.find("select", attrs={"id": "categorySelect"})
    try:
        subcatoption = subcat.find_all("option")
        for subcatop in subcatoption:
            print >>f2, "%s,%s" % (line, str(subcatop.get_text()).strip())
            logging.debug("%s,%s" % (line, str(subcatop.get_text()).strip()))
    except:
        print >>f2, "%s,%s" % (line, "None")
        logging.debug("%s,%s" % (line, "None"))
def main():
    directory = "dir%s/categories" % (time.strftime("%d%m%Y"))
    try:
        os.makedirs(directory)
    except:
        pass
    f = open("to_extract_cat.txt", "w+")
    print >>f, directory
    f.close()
    f = open("extracted_cat.txt", "a+")
    print >>f, directory
    f.close()
    link = "http://www.filmlinks4u.net/"
    page = req_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_cat = soup.find("li", attrs={"id": "categories-3"})
    tag_cat_a_list = tag_cat.find_all("a")
    catlink_cattitle = []
    for al in tag_cat_a_list:
        catlink = str(al.get("href")).strip()
        cattitle = str(catlink.split("/")[-1]).strip().replace(",", "").replace("\n", "").replace("\t", "")
        catlink_cattitle.append([catlink, cattitle])
    mainthreading(catlink_cattitle)
def main(line):
    line = ast.literal_eval(line)
    line = map(str.strip, line)
    target = line[0]
    catlink = line[1]
    cattitle = line[2]
    subcatlink = line[3]
    subcattitle = line[4]
    brand = line[5]
    productlink = line[6]
    imagelink = line[7]
    producttitle = line[8]
    f = open("to_extractbagittoday")
    directory = f.read().strip()
    f.close()
    dir2 = "%s/%s/%s/%s/%s" % (directory, target, cattitle, subcattitle, brand)
    try:
        os.makedirs(dir2)
    except:
        pass
    filename = "%s/%s.csv" % (dir2, brand)
    f = open(filename, "a+")
    page = req_proxy.main(productlink)
    soup = BeautifulSoup(page)
    product_path = productlink.split("/")
    product_path = filter(None, product_path)
    sku = str(product_path[-1])
    start = sku.find("-")
    if start != -1:
        sku = sku[start + 1:]
    sp = soup.find("span", attrs={"class": "offer-price"})
    sp = str(sp.get_text())
    mrp = sp
    mrp2 = soup.find("span", attrs={"class": "mrp-price"})
    if mrp2 is not None:
        mrp = str(mrp2.get_text())
    colour = None
    colour2 = soup.find("span", attrs={"class": "colorClass"})
    if colour2 is not None:
        colour = str(colour2.get("title"))
    size = []
    size2 = soup.find("ul", attrs={"class": "attributes-ul-cont"})
    if size2 is not None:
        size2 = size2.find_all("input")
        for sz in size2:
            size.append(str(sz.get("value")))
    size = str(size)
    tree = html.fromstring(page)
    desc2 = tree.xpath("/html/body/div/section/div[4]/div/section/div/div/div/div")
    desc2 = desc2[0]
    desc = str(html.tostring(desc2))
    spec = None
    date = str(time.strftime("%d%m%Y"))
    status = "None"
    vender = "bagittoday.com"
    metadesc2 = soup.find("meta", attrs={"name": "description"})
    metadesc = str(metadesc2.get("content"))
    metatitle2 = soup.find("title")
    metatitle = str(metatitle2.get_text())
    output = [sku, producttitle, catlink, sp, cattitle, subcattitle, brand, imagelink,
              mrp, colour, target, productlink, vender, metatitle, metadesc, size,
              desc, spec, date, status]
    output = map(string_strip, output)
    print >>f, ','.join(output)
    print output
def main(filename, brandname, catname, l):
    try:
        item_link = l
        page = req_proxy.main(item_link)
        soup = BeautifulSoup(page)
        tag_dis = soup.find("div", attrs={"id": "description"})
        if tag_dis:
            tag_dis = str(tag_dis).replace("\n", "")
        tag_spec = soup.find("div", attrs={"id": "specifications"})
        if tag_spec:
            tag_spec = str(tag_spec).replace("\n", "")
        tag_h1 = soup.find("h1", attrs={"itemprop": "name"})
        item_title = str(tag_h1.get_text()).strip()
        try:
            tag_colour = soup.find("div", attrs={"class": "line extra_text bmargin10"})
            item_clour = str(tag_colour.get_text()).strip()
        except:
            item_clour = " No more colour"
        tag_img = soup.find("img", attrs={"id": "visible-image-small"})
        item_image = tag_img.get("src")
        try:
            tag_price = soup.find("span", attrs={"class": "fk-font-verybig pprice fk-bold"})
            item_price = str(tag_price.get_text()).strip()
        except:
            tag_price = soup.find("div", attrs={"class": "prices"})
            item_price = str(tag_price.get_text()).strip().replace("\n", " ")
        try:
            tag_mrp = soup.find("span", attrs={"id": "fk-mprod-list-id"})
            item_mrp = str(tag_mrp.get_text()).strip()
        except:
            item_mrp = item_price
        tag_seller = soup.find("a", attrs={"class": "pp-seller-badge-name fk-bold"})
        item_seller = str(tag_seller.get_text()).strip()
        try:
            tag_sku = soup.find("a", attrs={"class": "btn btn-orange btn-buy-big fk-buy-now fkg-pp-buy-btn"})
            sku = str(tag_sku.get("href")).split("=")[-1].strip()
        except:
            sku = "no sku defined"
        size = []
        try:
            tag_multiselect = soup.find_all("div", attrs={"class": "multiselect-item"})
            for l in tag_multiselect:
                try:
                    size.append(str(l.get_text()))
                except:
                    pass
        except:
            pass
        if not size:
            size.append("No size defined")
        size2 = ' '.join(size).replace("\n", " ")
        del size[:]
        del size
        date = str(time.strftime("%d:%m:%Y")).strip()
        f = open(filename, "a+")
        print >>f, ','.join([date, catname, brandname, item_title, item_price, item_image,
                             item_clour, item_mrp, item_seller, item_link, sku, size2,
                             str(tag_dis), str(tag_spec)])
        f.close()
        logging.debug([date, catname, brandname, item_title, item_price, item_image,
                       item_clour, item_mrp, item_seller, item_link, sku, size2,
                       str(tag_dis), str(tag_spec)])
    except:
        f = open("newerrorfile.txt", "a+")
        print >>f, l
        f.close()
enclosure_queue = Queue()
logging.basicConfig(level=logging.DEBUG,
                    format='[%(levelname)s] (%(threadName)-10s) %(message)s',
                    )


def main2(i, q):
    while True:
        filename, brandname, catname, l = q.get()
        item_link = l
        #page = urll_proxy.main(l)
        #assert page
        page = req_proxy.main(l)
        soup = BeautifulSoup(page)
        #page.close()
        try:
            tag_dis = soup.find("div", attrs={"id": "description"})
            if tag_dis:
                tag_dis = str(tag_dis).replace("\n", "")
            tag_spec = soup.find("div", attrs={"id": "specifications"})
            if tag_spec:
                tag_spec = str(tag_spec).replace("\n", "")
            tag_h1 = soup.find("h1", attrs={"itemprop": "name"})
            item_title = str(tag_h1.get_text()).strip()
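# The worker above loops forever on q.get(), so it is presumably run as a daemon thread
# fed through enclosure_queue. A minimal sketch of such a driver (function name and
# worker count are assumptions, and it assumes the full main2 -- truncated above -- ends
# each iteration with q.task_done()):
def start_workers(work_items, num_workers=4):
    for i in range(num_workers):
        t = threading.Thread(target=main2, args=(i, enclosure_queue))
        t.setDaemon(True)                 # daemon workers die with the main thread
        t.start()
    for item in work_items:               # item = (filename, brandname, catname, link)
        enclosure_queue.put(item)
    enclosure_queue.join()                # returns once every item has been acknowledged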
def main2(line):
    f = open("to_extract_cat.txt")
    directory = f.read().strip()
    f.close()
    filename2 = "%s/%s" % (directory, "f_ct_cl_st_sl_vt_wp_wl_img.txt")
    f2 = open(filename2, "a+")
    line2 = line.split(",")
    catlink = str(line2[0]).strip()
    cattitle = str(line2[1]).replace("\n", " ").replace("\t", "").replace("\r", "").strip()
    sub_catlink = line2[2]
    sub_cattitle = line2[3].replace("\n", " ").replace("\t", "").replace("\r", "").strip()
    end = sub_cattitle.find(")")
    if end != -1:
        sub_cattitle = sub_cattitle[:end + 1]
    page = req_proxy.main(sub_catlink)
    soup = BeautifulSoup(page)
    tree = html.fromstring(page)
    image = "None"
    if tree is not None:
        image = str(tree.xpath('/html/body/div/div/div[5]/div/div[2]/div/div/div[3]/a/img/@src')[0]).strip()
    tag_hotserver = soup.find_all("span", text=re.compile("Host Server"))
    for l in tag_hotserver:
        loop = True
        video_type = str(l.next_sibling).replace("\n", " ").replace("\t", "").replace("\r", "").strip()
        next_a = l.find_next("a")
        while loop is True:
            try:
                tag_nm = str(next_a.name).strip()
                if tag_nm == "br":
                    pass
                if tag_nm == "a":
                    watchlink = str(next_a.get("href")).strip()
                    watchpart = str(next_a.get_text()).replace("\n", " ").replace("\t", "").replace("\r", "").strip()
                    print >>f2, ','.join([cattitle, catlink, sub_cattitle, sub_catlink,
                                          video_type, watchpart, watchlink, image])
                    print ','.join([cattitle, catlink, sub_cattitle, sub_catlink,
                                    video_type, watchpart, watchlink, image])
                if tag_nm != "a" and tag_nm != "br":
                    loop = False
                next_a = next_a.find_next_sibling()
            except:
                loop = False
    f2.close()