def crawl_category(self, ctx='', **kwargs):
    """Crawl the site's top navigation and persist one leaf Category per department.

    :param ctx: sender tag forwarded to the ``common_saved`` signal.

    Skips departments whose name contains 'brand'.  For each remaining
    department the listing page is fetched, the page count is derived from
    the pager widget, and the Category record is created/updated before
    ``common_saved`` is emitted.
    """
    res = requests.get(HOST)
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)
    dept_nodes = tree.cssselect('div#top-navigation ul.navigation li.menu-item a')
    for dept_node in dept_nodes:
        # Guard: an anchor without text would crash .strip() on None.
        if not dept_node.text:
            continue
        key = dept_node.text.strip()
        if 'brand' in key.lower():
            continue
        combine_url = dept_node.get('href')
        # Prefix relative hrefs with the host.
        if not re.search(r'https?://.+', combine_url):
            combine_url = '%s%s' % (HOST, combine_url)
        r = requests.get(combine_url)
        r.raise_for_status()
        t = lxml.html.fromstring(r.content)
        # The last pager <li> before the 'nextLink' item holds the page count.
        pagesize_node = None
        for link_node in t.cssselect('div.atg_store_filter ul.atg_store_pager li'):
            if link_node.get('class') and 'nextLink' in link_node.get('class'):
                break
            pagesize_node = link_node
        pagesize = 1
        if pagesize_node is not None:
            # Guard against a pager item with no <a> child or empty text.
            anchors = pagesize_node.cssselect('a')
            if anchors and anchors[0].text:
                pagesize = int(anchors[0].text.strip())
        is_new = False
        is_updated = False
        category = Category.objects(key=key).first()
        if not category:
            is_new = True
            category = Category(key=key)
            category.is_leaf = True
        if combine_url and combine_url != category.combine_url:
            category.combine_url = combine_url
            is_updated = True
        if pagesize and pagesize != category.pagesize:
            category.pagesize = pagesize
            is_updated = True
        category.hit_time = datetime.utcnow()
        category.save()
        # Debug output; single-argument print() parses on both Python 2 and 3
        # (the original bare `print x` statements were Python-2-only).
        print(category.key)
        print(category.cats)
        print(category.pagesize)
        print(category.combine_url)
        print(is_new)
        print(is_updated)
        print('')
        common_saved.send(sender=ctx, obj_type='Category', key=category.key,
                          url=category.combine_url,
                          is_new=is_new, is_updated=((not is_new) and is_updated))
def crawl_category(self, ctx='', **kwargs):
    """Walk the primary-nav tabs and their fly-out columns, storing a leaf
    Category for every sub-department link and emitting ``common_saved``.

    :param ctx: sender tag forwarded to the ``common_saved`` signal.
    """
    resp = requests.get(HOST)
    resp.raise_for_status()
    root = lxml.html.fromstring(resp.content)
    for tab in root.cssselect('nav#primary-nav > ul li.tab'):
        primary_dept = tab.cssselect('h2 a span')[0].text
        # The 'Brands' tab is not a merchandise department; skip it.
        if primary_dept.lower() == 'brands':
            continue
        sub_dept = None
        for link in tab.cssselect('div.nav-category div.nav-category-column ul li a'):
            sub_dept = link.text
            combine_url = link.get('href')
            # The trailing path segment doubles as the category key.
            key = combine_url.split('/')[-1]
            if re.search(r'https?://.+', combine_url) is None:
                combine_url = '%s%s' % (HOST, combine_url)
            is_new = False
            is_updated = False
            category = Category.objects(key=key).first()
            if not category:
                is_new = True
                category = Category(key=key)
                category.is_leaf = True
            # Record both breadcrumb levels, primary first.
            for dept in (primary_dept, sub_dept):
                if dept not in category.cats:
                    category.cats.append(dept)
                    is_updated = True
            # Compare URLs ignoring the query string so volatile
            # parameters do not force an update on every crawl.
            if combine_url:
                stored = category.combine_url
                if not stored or combine_url.split('?')[0] != stored.split('?')[0]:
                    category.combine_url = combine_url
                    is_updated = True
            category.hit_time = datetime.utcnow()
            category.save()
            common_saved.send(sender=ctx, obj_type='Category',
                              key=category.key, url=category.combine_url,
                              is_new=is_new,
                              is_updated=((not is_new) and is_updated))
def crawl_category(self, ctx='', **kwargs):
    """Crawl the 'more departments' menu, then each department page's
    sidebar sub-sections, saving a leaf Category per (dept, sub-dept)
    pair and emitting ``common_saved``.

    :param ctx: sender tag forwarded to the ``common_saved`` signal.
    """
    front = requests.get(HOST)
    front.raise_for_status()
    front_tree = lxml.html.fromstring(front.content)
    for dept_link in front_tree.cssselect('div#nav div#moreDeptsWrap ul li a'):
        dept = dept_link.text.strip()
        href = dept_link.get('href')
        if re.search(r'https?://.+', href) is None:
            href = '%s%s' % (HOST, href)
        page = requests.get(href)
        page.raise_for_status()
        page_tree = lxml.html.fromstring(page.content)
        for sub_link in page_tree.cssselect('div#tcSideCol h5 a'):
            sub_dept = sub_link.text.strip()
            cats = [dept, sub_dept]
            # Key is 'Dept_SubDept', unique per sidebar entry.
            key = '_'.join(cats)
            combine_url = sub_link.get('href')
            if re.search(r'https?://.+', combine_url) is None:
                combine_url = '%s%s' % (HOST, combine_url)
            is_new = False
            is_updated = False
            category = Category.objects(key=key).first()
            if not category:
                is_new = True
                category = Category(key=key)
                category.is_leaf = True
            # Merge newly-seen breadcrumb names into the stored cats.
            if set(cats).difference(category.cats):
                category.cats = list(set(cats) | set(category.cats))
                is_updated = True
            if combine_url and combine_url != category.combine_url:
                category.combine_url = combine_url
                is_updated = True
            category.hit_time = datetime.utcnow()
            category.save()
            common_saved.send(sender=ctx, obj_type='Category',
                              key=category.key, url=category.combine_url,
                              is_new=is_new,
                              is_updated=((not is_new) and is_updated))
def crawl_category(self, ctx='', **kwargs):
    """Crawl the main nav's menu columns, saving a leaf Category per
    sub-category link and emitting ``common_saved`` for each.

    :param ctx: sender tag forwarded to the ``common_saved`` signal.

    Columns whose heading contains 'designer' are skipped.
    """
    res = requests.get(HOST)
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)
    primary_cat_nodes = tree.cssselect('div.nav ul.menu li.menu-column')
    for primary_cat_node in primary_cat_nodes:
        primary_cat = primary_cat_node.cssselect('a.menu-column-link')[0].text.strip()
        if primary_cat and 'designer' in primary_cat.lower():
            continue
        sub_cat_nodes = primary_cat_node.cssselect('ul.sub-menu li.sub-menu-column ul li a')
        for sub_cat_node in sub_cat_nodes:
            # NOTE(review): sub_cat is extracted but never stored in
            # category.cats (unlike primary_cat) — confirm intent.
            sub_cat = sub_cat_node.text.strip()
            combine_url = sub_cat_node.get('href')
            # Strip the host literally: the previous re.sub(HOST, ...)
            # treated HOST's dots/slashes as regex metacharacters, which
            # could mangle the key. Then slug the path into a key.
            key = combine_url.replace(HOST, '').replace('/', '_')
            is_new = False
            is_updated = False
            category = Category.objects(key=key).first()
            if not category:
                is_new = True
                category = Category(key=key)
                category.is_leaf = True
            if primary_cat not in category.cats:
                category.cats.append(primary_cat)
                is_updated = True
            if combine_url and combine_url != category.combine_url:
                category.combine_url = combine_url
                is_updated = True
            category.hit_time = datetime.utcnow()
            category.save()
            common_saved.send(sender=ctx, obj_type='Category',
                              key=category.key, url=category.combine_url,
                              is_new=is_new,
                              is_updated=((not is_new) and is_updated))