def audit(self, origin, response):
    """Breadth-first crawl of every URL queued in ``self.QUEUES``.

    Each dequeued URL is fetched, appended to ``self.RESULTS``, and any
    newly discovered links that are in neither the results nor the queue
    are pushed onto the front of the queue for later crawling.

    :param origin: original url. all url need match with original url
    :param response: initial response; overwritten by each fetch inside
        the loop, so the caller's value is effectively discarded
    :return: None; crawled URL objects accumulate in ``self.RESULTS``
    """
    while self.QUEUES:
        url_ = self.QUEUES.pop()
        self.debug(" [*] Crawling URL: " + url_.get_url())  # print debug
        self.RESULTS.append(url_)
        # Re-fetch the page for this queue entry; shadows the incoming
        # `response` argument (same as the original code).
        header, response = self.connect_getdata(url_.domain, url_.port,
                                                url_.get_module())
        links = self.get_links(response, self.domain, self.port, url_.folder)
        for link in links:
            url = URL(link)
            # Enqueue only links we have neither visited nor queued yet.
            if not self.is_in_results(url) and not self.is_in_queues(url):
                self.QUEUES.insert(0, url)
                self.debug(url.get_url())
                self.debug_socket(url.get_url())
    # Drop any falsy placeholders gathered during the crawl.
    # (List comprehension instead of filter(): same result on Python 2,
    # and still a list on Python 3.)
    self.RESULTS = [r for r in self.RESULTS if r]
import sys from DB import DB from URL import URL db = DB('citeseerx.db') db.create_tables() # db.del_all() # http://citeseerx.ist.psu.edu/viewdoc/summary?cid=16057 if len(sys.argv) == 2: url = URL(sys.argv[1]) url.open() db.insert('link', {'doi': url.get_doi(), 'url': url.get_url()}) else: print 'Please supply proper URL.'
}) # add citations cit_html = url.get_citations() soup = BeautifulSoup(cit_html, "html.parser") trs = soup.findAll('tr', {'class': None, 'id': None}) for tr in trs: td = tr.findAll('td')[1] a = td.find('a') href = a['href'] if (href.find('viewdoc') >= 0): urlt = 'http://citeseerx.ist.psu.edu/viewdoc/summary' + href[ href.find('?'):] urlt = URL(urlt) urlt.open() print ' -> ', urlt.get_url() if (urlt.status_ok()): # print tr.find('p', {'class': 'citationContext'}) if tr.find('p', {'class': 'citationContext'}): context = tr.find('p', { 'class': 'citationContext' }).findAll(text=True)[0] else: context = '' if not db.exists('citations', { 'doi_f': url.get_doi(), 'doi_t': urlt.get_doi() }): db.insert( 'citations', { 'doi_f': url.get_doi(),