def crawl(self):
    """ Main function in the crawling process.  Core algorithm is:

        q <- starting page
        while q not empty:
            url <- q.get()
            if url is new and suitable:
                page <- fetch(url)
                q.put(urls found in page)
            else:
                nothing

    "New and suitable" means that we don't re-visit URLs we've already
    fetched, and that user-supplied criteria like maximum search depth
    are checked.
    """
    q = Queue()
    q.put((self.root, 0))

    while not q.empty():
        this_url, depth = q.get()

        # Non-URL-specific filter: discard anything over the depth limit
        if depth > self.depth_limit:
            continue

        # Apply URL-based filters.
        do_not_follow = [f for f in self.pre_visit_filters if not f(this_url)]

        # Special-case depth 0 (starting URL)
        if depth == 0 and [] != do_not_follow:
            print >> sys.stderr, "Whoops! Starting URL %s rejected by the following filters:" % this_url, do_not_follow

        # If no filters failed (that is, all passed), process the URL
        if [] == do_not_follow:
            try:
                self.visited_links.add(this_url)
                self.num_followed += 1
                page = Fetcher(this_url)
                page.fetch()
                for link_url in [self._pre_visit_url_condense(l) for l in page.out_links()]:
                    if link_url not in self.urls_seen:
                        q.put((link_url, depth + 1))
                        self.urls_seen.add(link_url)

                    do_not_remember = [f for f in self.out_url_filters if not f(link_url)]
                    if [] == do_not_remember:
                        self.num_links += 1
                        self.urls_remembered.add(link_url)
                        link = Link(this_url, link_url, "href")
                        if link not in self.links_remembered:
                            self.links_remembered.add(link)
            except Exception, e:
                print >> sys.stderr, "ERROR: Can't process url '%s' (%s)" % (this_url, e)
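A minimal, self-contained sketch of the queue-based traversal the docstring above describes. The fetch_outlinks helper is a hypothetical stand-in for Fetcher/out_links() and is not part of the original crawler.

from Queue import Queue  # Python 2; on Python 3 this would be `from queue import Queue`

def bfs_crawl(root, depth_limit, fetch_outlinks):
    """Breadth-first traversal: visit each URL at most once and stop
    expanding beyond depth_limit.  fetch_outlinks(url) must return an
    iterable of URLs found on that page (hypothetical helper)."""
    seen = set([root])
    q = Queue()
    q.put((root, 0))
    while not q.empty():
        url, depth = q.get()
        if depth > depth_limit:
            continue
        for link in fetch_outlinks(url):
            if link not in seen:
                seen.add(link)
                q.put((link, depth + 1))
    return seen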
def getLinks(url):
    # Fetch the page at `url` and print its out-links, numbered.
    page = Fetcher(url)
    page.fetch()
    for i, url in enumerate(page):
        print "%d. %s" % (i, url)
c2_password = config['admin_cs2_password'] if 'admin_cs2_password' in config else None
backup_dir = config['backups_path'] \
    if 'backups_path' in config and config['backups_path'] != "." \
    else script_dir
report_path = config['report_path'] if 'report_path' in config else None
latest_backup_symlink = config['latest_backup_symlink'] if 'latest_backup_symlink' in config else None

if not os.path.isdir(backup_dir):
    os.makedirs(backup_dir)

a = Authenticator(event_name, c2_login, c2_password, interactive=False)
if not a.sign_in():
    exit()

f = Fetcher(a.event_name, a.cookie)
if not f.fetch_data():
    exit()
f.fetch_etickets()
f.fetch_details()

db_path = os.path.join(backup_dir, datetime.now().strftime('%y-%m-%d_%H-%M-%S.db'))
MakeDB(db_path, f.data)

if latest_backup_symlink:
    if os.path.exists(latest_backup_symlink):
        os.remove(latest_backup_symlink)
    try:
        os.symlink(db_path, latest_backup_symlink)
    except OSError:
def setUp(self):
    self.fetcher = Fetcher()
class FetcherTest(unittest.TestCase):
    """ testing for success of Fetcher functions """

    def get_data_file(self, filename):
        """ returns the content of a test data file in ./data """
        test_data = os.path.join(os.path.dirname(__file__), "data")
        f = open(os.path.join(test_data, filename))
        data = f.read()
        f.close()
        return data

    def setUp(self):
        self.fetcher = Fetcher()

    def test_get_full_url(self):
        control_data = "https://api.github.com/helloworld"
        result_url = self.fetcher.get_full_url("helloworld")
        self.assertEqual(control_data, result_url, "Full URL does not match: " + control_data + " vs " + result_url)

    def test_process_repo_single_repo(self):
        self.fetcher.get_from_net = Mock(return_value=self.get_data_file("octocat.Spoon-Knife.json"))
        result = self.fetcher.process_repo("")
        self.assertIsInstance(result, type(list()), "Result was not a list")
        self.assertNotEqual(len(result), 0, "List is empty")
        self.assertEqual(len(result), 1, "List has extra items")
        self.assertIsInstance(result[0], type(dict()), "List item is not a dictionary: " + repr(result[0]))
        # testing membership
        self.assertIn("full_name", result[0], "Full name missing from dictionary: " + repr(result[0]))
        self.assertIn("name", result[0], "Name missing from dictionary: " + repr(result[0]))
        self.assertIn("fork", result[0], "Fork missing from dictionary: " + repr(result[0]))
        self.assertIn("url", result[0], "URL missing from dictionary: " + repr(result[0]))
        self.assertIn("language", result[0], "Language missing from dictionary: " + repr(result[0]))
        self.assertIn("created", result[0], "Created missing from dictionary: " + repr(result[0]))
        # testing values
        self.assertEqual(result[0]["full_name"], "octocat/Spoon-Knife", "Full name does not match, Fullname: " + repr(result[0]["full_name"]))
        self.assertEqual(result[0]["name"], "Spoon-Knife", "Name does not match, Name: " + repr(result[0]["name"]))
        self.assertEqual(result[0]["fork"], False, "Fork does not match, Fork: " + repr(result[0]["fork"]))
        self.assertEqual(result[0]["url"], "https://api.github.com/repos/octocat/Spoon-Knife", "URL does not match, URL: " + repr(result[0]["url"]))
        self.assertEqual(result[0]["language"], None, "Language does not match, Language: " + repr(result[0]["language"]))
        self.assertEqual(result[0]["created"], "2011-01-27T19:30:43Z", "Created does not match, Created: " + repr(result[0]["created"]))

    def test_process_repo_multiple_repo(self):
        self.fetcher.get_from_net = Mock(return_value=self.get_data_file("octocat.json"))
        result = self.fetcher.process_repo("", True)
        self.assertIsInstance(result, type(list()), "Result was not a list")
        self.assertNotEqual(len(result), 0, "List is empty")
        self.assertEqual(len(result), 3, "List has extra items")
        self.assertIsInstance(result[0], type(dict()), "List item is not a dictionary: " + repr(result[0]))
        self.assertIsInstance(result[1], type(dict()), "List item is not a dictionary: " + repr(result[1]))
        self.assertIsInstance(result[2], type(dict()), "List item is not a dictionary: " + repr(result[2]))
        # testing membership - first item
        self.assertIn("full_name", result[0], "Full name missing from dictionary: " + repr(result[0]))
        self.assertIn("name", result[0], "Name missing from dictionary: " + repr(result[0]))
        self.assertIn("fork", result[0], "Fork missing from dictionary: " + repr(result[0]))
        self.assertIn("url", result[0], "URL missing from dictionary: " + repr(result[0]))
        self.assertIn("language", result[0], "Language missing from dictionary: " + repr(result[0]))
        self.assertIn("created", result[0], "Created missing from dictionary: " + repr(result[0]))
        # testing membership - second item
        self.assertIn("full_name", result[1], "Full name missing from dictionary: " + repr(result[1]))
        self.assertIn("name", result[1], "Name missing from dictionary: " + repr(result[1]))
        self.assertIn("fork", result[1], "Fork missing from dictionary: " + repr(result[1]))
        self.assertIn("url", result[1], "URL missing from dictionary: " + repr(result[1]))
        self.assertIn("language", result[1], "Language missing from dictionary: " + repr(result[1]))
        self.assertIn("created", result[1], "Created missing from dictionary: " + repr(result[1]))
        # testing values - third item
        self.assertEqual(result[2]["full_name"], "octocat/ThisIsATest", "Full name does not match, Fullname: " + repr(result[2]["full_name"]))
        self.assertEqual(result[2]["name"], "ThisIsATest", "Name does not match, Name: " + repr(result[2]["name"]))
        self.assertEqual(result[2]["fork"], False, "Fork does not match, Fork: " + repr(result[2]["fork"]))
        self.assertEqual(result[2]["url"], "https://api.github.com/repos/octocat/ThisIsATest", "URL does not match, URL: " + repr(result[2]["url"]))
        self.assertEqual(result[2]["language"], None, "Language does not match, Language: " + repr(result[2]["language"]))
        self.assertEqual(result[2]["created"], "2012-03-07T23:25:47Z", "Created does not match, Created: " + repr(result[2]["created"]))

    def tearDown(self):
        self.fetcher = None
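A hedged sketch of driving FetcherTest with the standard unittest runner. It assumes the class above lives in an importable module alongside Fetcher and Mock (e.g. from mock import Mock), which the snippet itself does not show.

import unittest

if __name__ == '__main__':
    # Collect and run only the FetcherTest case defined above.
    suite = unittest.TestLoader().loadTestsFromTestCase(FetcherTest)
    unittest.TextTestRunner(verbosity=2).run(suite)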
event_name = config['event_name']
c2_login = config['admin_cs2_name']
c2_password = config['admin_cs2_password'] if 'admin_cs2_password' in config else None
db_path = config['db_path']
sql = config['sql_after_get'].strip() if 'sql_after_get' in config else None

all_data = len(sys.argv) > 1 and sys.argv[1] == '-a'

a = Authenticator(event_name, c2_login, c2_password)
if not a.sign_in():
    exit()
print()

f = Fetcher(a.event_name, a.cookie)
if not f.fetch_data():
    exit()
if all_data:
    if not f.fetch_etickets():
        exit()
    if not f.fetch_details():
        exit()

print('\nCreating ' + db_path + '...')
MakeDB(db_path, f.data)

if sql:
    from tabulate import tabulate
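The snippet above reads settings from a pre-existing config mapping but does not show where it comes from. A sketch of one plausible way to build it, assuming a JSON file named config.json; both the format and the file name are guesses, only the keys match the code above.

import json

# Hypothetical config.json; 'admin_cs2_password' and 'sql_after_get' are
# treated as optional by the code above, so they may be omitted:
# {
#   "event_name": "spring-fair",
#   "admin_cs2_name": "admin",
#   "admin_cs2_password": "secret",
#   "db_path": "event.db",
#   "sql_after_get": "SELECT name, COUNT(*) FROM tickets GROUP BY name"
# }
with open('config.json') as fh:
    config = json.load(fh)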
def main(user='', repo='', logfile='', frmt='json'):
    """Entry point

    :param user: username
    :param repo: reponame (if username is provided), fullname (if username isn't provided)
    :param logfile: logfile
    :param frmt: csv, json (not yet implemented, only does csv)
    """
    repo_url = ''
    single_repo = False   # flag: fetching one repo (True) or many (False)
    public_repos = False  # multiple repos are public repos
    count = 0             # max repos to fetch if public_repos is selected

    if user != '' and repo != '':
        fullname = "{0}/{1}".format(user, repo)
        repo_url = REPO_URL.format(full_name=fullname)
        single_repo = True
    elif user != '':
        repo_url = USER_REPO_LIST.format(user=user)
        single_repo = False
    elif repo != '':
        # fullname of repo is provided
        repo_url = REPO_URL.format(full_name=repo)
        single_repo = True
    else:
        # fetch public repos
        ans = raw_input('fetch all repos [y/n]? ')
        if ans in ('y', 'Y'):
            count = raw_input('maximum number of repos [{0}]? '.format(DEFAULT_MAX_PUBLIC_REPOS))
            if count == '':
                count = DEFAULT_MAX_PUBLIC_REPOS
            else:
                try:
                    count = int(count)
                except ValueError:
                    print 'Invalid integer'
                    count = DEFAULT_MAX_PUBLIC_REPOS
            repo_url = ALL_REPO_LIST
            single_repo = False
            public_repos = True
        else:
            game_over('no repo/user selected')

    fetcher = Fetcher()
    repo_url = fetcher.get_full_url(repo_url)

    repo_dets = None
    if single_repo:
        repo_dets = fetcher.process_repo(repo_url)
    elif public_repos:
        repo_dets = fetcher.get_public_repos(count)
    else:
        repo_dets = fetcher.process_repo(repo_url, multiple=True)

    if logfile == '':
        # TODO use reponame if logfile is not present
        logfile = stdout
    else:
        logfile = open(logfile, 'w')
    if type(logfile) is not file:
        game_over('nowhere to log')

    print 'gotten', len(repo_dets), 'repos'
    for i in repo_dets:
        commits = fetcher.extract_commits(i)
        fetcher.write_commits(logfile, i, commits)
        # TODO write file in JSON

    if logfile != stdout:
        logfile.close()
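A sketch of one way main() might be wired to the command line. The original entry-point code is not part of this snippet, so the argparse flags below are illustrative assumptions only.

import argparse

if __name__ == '__main__':
    # Illustrative wrapper only: flag names and defaults are assumptions.
    parser = argparse.ArgumentParser(description='Fetch GitHub repository data')
    parser.add_argument('--user', default='', help='GitHub username')
    parser.add_argument('--repo', default='', help='repo name, or owner/repo full name')
    parser.add_argument('--logfile', default='', help='output file; stdout if omitted')
    args = parser.parse_args()
    main(user=args.user, repo=args.repo, logfile=args.logfile)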
#!/usr/bin/python3.3
# --------------------------------------------------------
# Copyright (c) 2013 Matthew Pate and Daniel Catalano
# [This program is licensed under the "MIT License"]
# Please see the file COPYING in the source distribution
# of this software for license terms.
# --------------------------------------------------------

# adds project root directory to PYTHONPATH, needed for the next import statement
from sys import path
path.append('..')

from lib.fetcher import Fetcher

spider = Fetcher()
spider.download_param_grib_range('gfs', 2013082400, 00, 240, 12, 1.0)
spider.download_param_grib_range('gfs', 2013082400, 00, 240, 12, 0.5)
#spider.download_param_grib_range('gfs', 2013080300, 00, 240, 12, 2.5)