def my_parse(self, response):
    log.msg('Parsing urls from %s' % response.url, level=log.INFO)

    # Each entry pairs an allowed URL pattern with a flag saying whether the
    # extracted link should be passed through clean_url() before storing.
    url_patterns = [
        # http://my.linkedin.com/directory/people/a.html
        (r'/directory/people/([a-z]|\@)\.html', False),
        # http://my.linkedin.com/directory/people/my/A1.html
        (r'/directory/people/my/[A-Z]\d+\.html', False),
        # http://my.linkedin.com/directory/people/my/ahamid-3.html
        # http://my.linkedin.com/directory/people/my/aan.html
        (r'/directory/people/my/[a-z]+(\-\d+)?\.html', False),
        # http://my.linkedin.com/pub/zarita-a-baharum/23/9a2/756
        (r'/pub/[a-z\-]+/[a-z0-9]+/[a-z0-9]+/[a-z0-9]+', True),
        # http://www.linkedin.com/in/levananh
        (r'/in/[a-z0-9]+$', True),
    ]

    links = []
    try:
        for pattern, needs_clean in url_patterns:
            lx = SgmlLinkExtractor(
                allow='(' + self.base_url + ')?' + pattern,
                deny=self.deny_re,
            )
            extracted = lx._extract_links(response.body, response.url, 'utf-8')
            extracted = lx._process_links(extracted)
            for link in extracted:
                found = clean_url(link.url) if needs_clean else link.url
                links.append(URL(main_url=response.url, found_urls=found))

        # Mark the country's root page with a '$' sentinel entry.
        if 'http://' + CountryCode.code in response.url:
            links.append(URL(main_url=response.url, found_urls='$'))
    except Exception:
        log.msg('Link extraction failed for %s' % response.url, level=log.ERROR)

    # If the current page is itself a profile page, extract the profile data.
    pub_re = [r'/pub/[a-z\-]+/[a-z0-9]+/[a-z0-9]+/[a-z0-9]+',
              r'/in/[a-z0-9]+']
    for pub in pub_re:
        if re.search(pub, response.url):
            self.extract(response)  # extract profiles

    self.db.insert_urls(links)
def _process_links(self, links):
    links = SgmlLinkExtractor._process_links(self, links)
    return links
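
# ---------------------------------------------------------------------------
# The methods above rely on a few project-local names that are not defined in
# this snippet: the URL item, the clean_url() helper, self.base_url,
# self.deny_re, CountryCode.code and self.db. The sketch below is only a
# guess at the URL item and clean_url(), inferred from how they are called
# above; the real definitions may differ. The imports are what the code needs
# under old (pre-1.0) Scrapy, where scrapy.log and SgmlLinkExtractor exist.

import re

from scrapy import log
from scrapy.item import Item, Field
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class URL(Item):
    # Assumed item layout, inferred from URL(main_url=..., found_urls=...).
    main_url = Field()
    found_urls = Field()


def clean_url(url):
    # Assumed helper: drop the query string and fragment so /pub/ and /in/
    # profile links are stored in a canonical form.
    return url.split('?', 1)[0].split('#', 1)[0]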