예제 #1
0
 def test_link_1(self):
     """Check the links built between ten consecutive test elements.

     Verifies the largest recorded distance, the elements linked to the
     first element, and the link information for one linked pair and one
     pair that is expected to have no link information.
     """
     # Materialise the elements: they are indexed below, which the lazy
     # ``map`` object returned on Python 3 does not support.
     elements = list(map(TestElement, range(10)))
     links = Links(elements, distance)
     distances = links.values()
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(max(distances), 2)
     self.assertEqual(elements[0].linked, [elements[1], elements[2]])
     self.assertEqual(links.info(elements[2], elements[4]), 2)
     # The second positional argument of assertIsNone is the failure
     # message, not an expected value, so it is dropped.
     self.assertIsNone(links.info(elements[2], elements[5]))
예제 #2
0
 def test_link_1(self):
     """Check the links built between ten consecutive test elements.

     Verifies the largest recorded distance, the elements linked to the
     first element, and the link information for one linked pair and one
     pair that is expected to have no link information.
     """
     # Materialise the elements: they are indexed below, which the lazy
     # ``map`` object returned on Python 3 does not support.
     elements = list(map(TestElement, range(10)))
     links = Links(elements, distance)
     distances = links.values()
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(max(distances), 2)
     self.assertEqual(elements[0].linked, [elements[1], elements[2]])
     self.assertEqual(links.info(elements[2], elements[4]), 2)
     # The second positional argument of assertIsNone is the failure
     # message, not an expected value, so it is dropped.
     self.assertIsNone(links.info(elements[2], elements[5]))
예제 #3
0
파일: app.py 프로젝트: dmtrbrlkv/LinksApi
def visited_domains():
    """Build the JSON response listing domains stored for a requested range.

    The "from" and "to" request arguments select the range. A failure while
    reading them yields a BAD_REQUEST response carrying the exception; a
    failure while querying the store yields an INTERNAL_SERVER_ERROR
    response.
    """
    try:
        range_start, range_end = get_visited_domains_params(request.args, ("from", "to"))
    except Exception as e:
        return make_json_response(error=e, code=HTTPStatus.BAD_REQUEST)

    try:
        stored = redis_connector.get_by_range(range_start, range_end)
    except Exception:
        return make_json_response(status=ERROR_INTERNAL, code=HTTPStatus.INTERNAL_SERVER_ERROR)

    return make_json_response(key=DOMAINS_KEY, data=Links(stored).get_domains())
예제 #4
0
def get_all_asset_links(zettel_paths):
    """Gather the asset links of every zettel into a single Links object.

    Each path in ``zettel_paths`` is read as UTF-8 text, its asset links are
    extracted with ``get_asset_links`` and added to the returned object.
    """
    collected = Links()

    for path in zettel_paths:
        with open(path, 'r', encoding='utf8') as zettel:
            text = zettel.read()

        # Merge this zettel's links into the running collection.
        collected.add(get_asset_links(text, path))

    return collected
예제 #5
0
 def reconstruct(self, simptcs, detector):
     """Build the input, links and reconstructor for the given particles.

     The element list taken from the PFInput has its clusters merged for
     the 'hcal_in' layer and then the 'ecal_in' layer before the links are
     created. Results are stored on ``self.pfinput``, ``self.links`` and
     ``self.pfreco``.
     """
     self.pfinput = PFInput(simptcs)
     merged = self.pfinput.element_list()
     for layer in ('hcal_in', 'ecal_in'):
         merged = merge_clusters(merged, layer)
     self.links = Links(merged, distance)
     self.pfreco = PFReconstructor(self.links, detector, self.logger)
예제 #6
0
    def crawl(self, homepage):
        """Recursively crawl the anchors found on ``homepage``.

        The page is fetched and, when the response status is 200, every
        ``<a href>`` is examined. Links that are not absolute HTTP(S) URLs
        are prefixed with the scheme and host of the current page. Each
        link accepted by ``Links.process_links`` increments ``self.count``
        and is crawled in turn. Scanning of the current page stops once
        more than 100 links have been counted, or when an accepted link
        does not contain ``url[7:-9]``.
        """
        url = homepage

        rep = r.get(url)
        if rep.status_code != 200:
            return

        soup = bs(rep.text, 'html.parser')

        # Split once; the pieces are only indexed when a relative link
        # actually needs the page's scheme and host.
        url_parts = url.split('/', 3)

        for anchor in soup.find_all('a'):
            link = anchor.get('href')
            if not link:
                continue

            # Treat https:// links as absolute too; previously they were
            # mistaken for relative links and had the page host prepended.
            if not link.startswith(("http://", "https://")):
                link = (url_parts[0] + "//" + url_parts[1] + url_parts[2]
                        + "/" + link)

            if Links.process_links(link):
                self.count += 1

                if self.count > 100:
                    break

                if url[7:-9] not in link:
                    break
                self.crawl(link)
예제 #7
0
def get_asset_links(contents, zettel_path):
    """Return a Links object holding the asset links found in ``contents``.

    ``zettel_path`` must be an absolute path or the empty string. Every
    match of ``asset_link_pattern`` is appended together with its name and
    the zettel path, except '.html' / '.htm' links that
    ``html_link_is_URL`` identifies as URLs.
    """
    assert os.path.isabs(zettel_path) or zettel_path == ''

    found = Links()
    for match in re.finditer(asset_link_pattern, contents):
        groups = match.groupdict()
        target = groups['link']

        # HTML pages reached by URL are not assets; locally saved files
        # with the same endings still are.
        if target.endswith(('.html', '.htm')) and html_link_is_URL(target):
            continue

        found.append(target, groups['name'], zettel_path)

    return found
예제 #8
0
    def spi_response(self, response, *args, **kwargs):
        '''Handle the response of an spi_request.

        For HTML responses whose content hash has not been seen before, the
        hash is recorded in ``self.URLhash`` and the links parsed from the
        response are added to ``self.URLset``. Extra positional and keyword
        arguments are accepted and ignored.
        '''

        if 'text/html' in response.headers['Content-Type']:
            hash_val = Hasher.HashMD5(response.content)
            if hash_val not in self.URLhash:
                self.URLhash.add(hash_val)
                # update() mutates the set in place; union() returns a new
                # set, so its result was previously thrown away and no link
                # was ever stored.
                self.URLset.update(Links.parse_link(response))
예제 #9
0
 def spi_response(self, response):
     '''Handle the response of an spi_request.

     Only HTML responses with status 200 whose content hash is not yet
     known to redis are processed: the page is saved when the database is
     connected, the hash is registered in redis, and every parsed link is
     put on ``self.URLset``.
     '''
     is_html = 'text/html' in response.headers['Content-Type']
     if not (is_html and response.status_code == 200):
         return

     hash_val = Hasher.HashMD5(response.content)
     if self.redis.getVariable(hash_val) is not None:
         return

     if self.database.isConn():
         self.database.saveData(hash=hash_val, url=response.url, content=response)
     self.redis.setVariable(hash_val, response.url)

     for link in Links.parse_link(response):
         self.URLset.put(link)
예제 #10
0
    def download(self, keyword):
        """Download every image link collected for ``keyword``.

        Files are saved under ``<self.path>/<keyword>/`` with a zero-padded
        index as the file name. After saving, each file is validated: an
        unreadable file is removed, and a file whose validated extension
        differs from the one taken from the link is renamed. Any exception
        raised for a link is printed and the next link is tried.
        """
        source = Links()
        print('Collecting downloadable links of {}...'.format(keyword))
        links = source.collect(keyword)

        print(
            'Downloading images of {} from collected links...'.format(keyword))
        self.mkdir('{}/{}'.format(self.path, keyword))
        total = len(links)
        for position, url in enumerate(links):
            try:
                print(
                    'Downloading this image based on the keyword {} from {}: {}/{}'
                    .format(keyword, url, position + 1, total))
                response = requests.get(url, stream=True)
                declared_ext = self.get_extension(url)
                stem = '{}/{}/{}'.format(self.path, keyword,
                                         str(position).zfill(4))
                saved_path = stem + '.' + declared_ext
                self.save(response, saved_path)

                del response

                print("Validating image file")
                detected_ext = self.validate(saved_path)
                if detected_ext is None:
                    print('Unreadable file - {}'.format(url))
                    os.remove(saved_path)
                elif declared_ext != detected_ext:
                    # Give the file the extension reported by validation.
                    os.rename(saved_path, stem + '.' + detected_ext)
                    print('Renaming extension {} -> {}'.format(
                        declared_ext, detected_ext))
            except Exception as e:
                print('Download failed.', e)
예제 #11
0
파일: app.py 프로젝트: dmtrbrlkv/LinksApi
def visited_links():
    """Store the links posted in the request body and return a JSON response.

    A failure while reading or parsing the JSON body yields a BAD_REQUEST
    response carrying the exception; a failure while storing the links
    yields an INTERNAL_SERVER_ERROR response. On success an empty default
    response is returned.
    """
    try:
        # Reading the body and parsing it share the same failure response.
        links = Links.from_json(request.json)
    except Exception as e:
        return make_json_response(error=e, code=HTTPStatus.BAD_REQUEST)

    try:
        redis_connector.add(links.links, now_timestamp())
    except Exception:
        return make_json_response(error=ERROR_INTERNAL, code=HTTPStatus.INTERNAL_SERVER_ERROR)

    return make_json_response()
예제 #12
0
def merge_clusters(elements, layer):
    """Merge linked elements of ``layer`` and return the resulting list.

    Elements whose ``layer`` attribute equals ``layer`` are linked with
    ``Links``; every group of linked elements is collapsed into a shallow
    copy of its first member with the remaining members added via ``+=``.
    Single-member groups are kept as they are. Elements from other layers
    are appended unchanged after the merged ones.
    """
    same_layer = []
    other_layers = []
    for elem in elements:
        (same_layer if elem.layer == layer else other_layers).append(elem)

    merged = []
    links = Links(same_layer, distance)
    for group in links.groups.values():
        if len(group) == 1:
            merged.append(group[0])
            continue

        members = iter(group)
        first = next(members, None)
        if first is None:
            # Empty group: nothing to merge.
            continue

        # Accumulate into a copy so the original first cluster is untouched.
        supercluster = copy.copy(first)
        merged.append(supercluster)
        for cluster in members:
            supercluster += cluster

    merged.extend(other_layers)
    return merged
예제 #13
0
    def _enhance_element_tree(self, e):
        for element in e.getiterator():
            for child in list(element):
                if len(element.findall(child.tag)) > 1:
                    setattr(element, child.tag, element.findall(child.tag))
                elif len(list(child)) == 0:
                    setattr(element, child.tag, child.text)
                else:
                    setattr(element, child.tag, element.find(child.tag))

        l = []
        for element in e.getiterator('link'):
            d = {
                'href': element.attrib.get('href'),
                'rel': element.attrib.get('rel'),
                'type': element.attrib.get('type') or 'application/xml'
            }

            l.append(d)

        e.links = lambda: Links(l)
        e.link = lambda x: e.links().get(x)

        return e
예제 #14
0
파일: queue.py 프로젝트: gsterjov/pyLoader
class Queue (object):
	'''
	The download queue: a tree view backed by a store of packages, with one
	column each for order, name, links, size, downloaded amount, speed, ETA
	and progress.
	'''

	def __init__ (self, builder, client):
		'''
		Look up the queue widgets in ``builder``, attach a renderer and a
		cell data function to every column, subscribe to the ``client``
		queue/download events and restore the saved column widths.
		'''
		self.client = client

		# load the application settings
		self.settings = Gio.Settings.new ("org.pyLoader.queue")

		self.links = Links (builder, client)
		self.tree = builder.get_object ("queue_tree")

		# create the item store (packages)
		self.store = Gtk.ListStore (Package.__gtype__)
		self.store.set_sort_func (0, self.__store_compare, None)
		self.tree.set_model (self.store)


		# queue columns
		self.order_column		= builder.get_object ("queue_order")
		self.name_column		= builder.get_object ("queue_name")
		self.links_column		= builder.get_object ("queue_links")
		self.size_column		= builder.get_object ("queue_size")
		self.downloaded_column	= builder.get_object ("queue_downloaded")
		self.speed_column		= builder.get_object ("queue_speed")
		self.eta_column			= builder.get_object ("queue_eta")
		self.progress_column	= builder.get_object ("queue_progress")

		# create renderers
		order_renderer		= Gtk.CellRendererText()
		name_renderer		= Gtk.CellRendererText()
		links_renderer		= Gtk.CellRendererText()
		size_renderer		= Gtk.CellRendererText()
		downloaded_renderer	= Gtk.CellRendererText()
		speed_renderer		= Gtk.CellRendererText()
		eta_renderer		= Gtk.CellRendererText()
		progress_renderer	= Gtk.CellRendererProgress()

		# set column renderers
		self.order_column.pack_start (order_renderer, True)
		self.name_column.pack_start (name_renderer, True)
		self.links_column.pack_start (links_renderer, True)
		self.size_column.pack_start (size_renderer, True)
		self.downloaded_column.pack_start (downloaded_renderer, True)
		self.speed_column.pack_start (speed_renderer, True)
		self.eta_column.pack_start (eta_renderer, True)
		self.progress_column.pack_start (progress_renderer, True)

		self.order_column.set_cell_data_func (order_renderer, self.__render_order)
		self.name_column.set_cell_data_func (name_renderer, self.__render_name)
		self.links_column.set_cell_data_func (links_renderer, self.__render_links)
		self.size_column.set_cell_data_func (size_renderer, self.__render_size)
		self.downloaded_column.set_cell_data_func (downloaded_renderer, self.__render_downloaded)
		self.speed_column.set_cell_data_func (speed_renderer, self.__render_speed)
		self.eta_column.set_cell_data_func (eta_renderer, self.__render_eta)
		self.progress_column.set_cell_data_func (progress_renderer, self.__render_progress)


		# connect to ui events
		# self.tree.connect ("button-press-event", self.__on_button_press)

		selection = self.tree.get_selection()
		selection.connect ("changed", self.__on_selection_changed)

		# connect to client property events
		client.queue.added += self.__on_queue_added
		client.queue.changed += self.__on_queue_changed
		client.downloads.changed += self.__on_downloads_changed


		# load the queue column settings
		self.order_column.set_fixed_width (self.settings.get_uint ("column-order-size"))
		self.name_column.set_fixed_width (self.settings.get_uint ("column-name-size"))
		self.links_column.set_fixed_width (self.settings.get_uint ("column-links-size"))
		self.size_column.set_fixed_width (self.settings.get_uint ("column-size-size"))
		self.downloaded_column.set_fixed_width (self.settings.get_uint ("column-downloaded-size"))
		self.speed_column.set_fixed_width (self.settings.get_uint ("column-speed-size"))
		self.eta_column.set_fixed_width (self.settings.get_uint ("column-eta-size"))
		self.progress_column.set_fixed_width (self.settings.get_uint ("column-progress-size"))


	def save_state (self):
		'''
		Write the current width of every queue column to the settings
		'''
		self.settings.set_uint ("column-order-size", self.order_column.get_width())
		self.settings.set_uint ("column-name-size", self.name_column.get_width())
		self.settings.set_uint ("column-links-size", self.links_column.get_width())
		self.settings.set_uint ("column-size-size", self.size_column.get_width())
		self.settings.set_uint ("column-downloaded-size", self.downloaded_column.get_width())
		self.settings.set_uint ("column-speed-size", self.speed_column.get_width())
		self.settings.set_uint ("column-eta-size", self.eta_column.get_width())
		self.settings.set_uint ("column-progress-size", self.progress_column.get_width())


	def __render_order (self, column, cell, model, iter, data):
		'''
		Cell data function: show the order of the row's item
		'''
		# get the item we are dealing with
		item = model[iter][0]
		cell.set_property ("text", "{0}".format (item.order))


	def __render_name (self, column, cell, model, iter, data):
		'''
		Cell data function: show the name of the row's item
		'''
		# get the item we are dealing with
		item = model[iter][0]
		cell.set_property ("text", item.name)


	def __render_links (self, column, cell, model, iter, data):
		'''
		Cell data function: show "<links done>/<links total> completed"
		'''
		# get the item we are dealing with
		item = model[iter][0]
		# {1} is the total; the previous "{0}/{0}" printed the done count twice
		cell.set_property ("text", "{0}/{1} completed".format (item.links_done, item.links_total))


	def __render_size (self, column, cell, model, iter, data):
		'''
		Cell data function: show the formatted total size of the row's item
		'''
		# get the item we are dealing with
		item = model[iter][0]

		total = utils.format_size (item.size_total)
		cell.set_property ("text", total)


	def __render_downloaded (self, column, cell, model, iter, data):
		'''
		Cell data function: show the formatted downloaded size, or nothing
		when no data has been downloaded yet
		'''
		# get the item we are dealing with
		item = model[iter][0]

		if item.size_done > 0:
			total = utils.format_size (item.size_done)
			cell.set_property ("text", total)
		else:
			cell.set_property ("text", "")



	def __render_speed (self, column, cell, model, iter, data):
		'''
		Cell data function: show the summed speed of the item's links that
		are present in the client downloads, or nothing when the item is
		not downloading
		'''
		# get the item we are dealing with
		item = model[iter][0]

		if item.links_downloading:
			speed = 0
			downloads = self.client.downloads.value

			for link in item.links.itervalues():
				if downloads.has_key (link.id):
					speed += downloads[link.id].speed

			speed = utils.format_size (speed)
			cell.set_property ("text", "{0}/s".format (speed))

		else:
			cell.set_property ("text", "")


	def __render_eta (self, column, cell, model, iter, data):
		'''
		Cell data function: show the summed ETA of downloading links, a
		"Waiting" note with the remaining wait time for waiting links, or
		nothing for an inactive item
		'''
		# get the item we are dealing with
		item = model[iter][0]

		# link is active
		if item.links_downloading:
			eta = 0
			downloads = self.client.downloads.value

			for link in item.links.itervalues():
				if downloads.has_key (link.id):
					eta += downloads[link.id].eta

			eta = utils.format_time (eta)
			cell.set_property ("markup", eta)

		# link is waiting
		elif not item.links_downloading and item.links_waiting:
			eta = None
			downloads = self.client.downloads.value

			for link in item.links.itervalues():
				if downloads.has_key (link.id):
					time_left = downloads[link.id].time_left

					if not eta: eta = time_left
					elif time_left < eta: eta = time_left

			# eta stays None when no waiting link is in the downloads;
			# check it before comparing so None is never ordered against 0
			eta = eta if eta and eta > 0 else 0
			eta = utils.format_time (eta)
			cell.set_property ("markup", "<small>Waiting - {0}</small>".format (eta))

		# inactive link
		else:
			cell.set_property ("markup", "")


	def __render_progress (self, column, cell, model, iter, data):
		'''
		Cell data function: show the average percentage over the item's
		links, counting finished links as 100
		'''
		item = model[iter][0]

		percent = 0
		downloads = self.client.downloads.value

		for link in item.links.itervalues():
			if downloads.has_key (link.id):
				percent += downloads[link.id].percent

			elif link.status == Link.Status.FINISHED:
				percent += 100

		# a package without links has no progress; avoid dividing by zero
		link_count = len(item.links)
		cell.set_property ("value", percent / link_count if link_count else 0)


	def __store_compare (self, model, row1, row2, userdata):
		'''
		Sort function for the store: compare two rows by item order
		'''
		item1 = model[row1][0]
		item2 = model[row2][0]

		if item1.order < item2.order: return -1
		elif item1.order == item2.order: return 0
		else: return 1


	def __on_queue_added (self, prop, package):
		'''
		Handler to show newly added packages from the server
		'''
		self.store.append ([package])


	def __on_queue_changed (self, prop, package):
		'''
		Handler to redraw the tree when a queued package changes
		'''
		self.tree.queue_draw()


	def __on_downloads_changed (self, property, value):
		'''
		Handler to redraw the tree when the downloads change
		'''
		self.tree.queue_draw()



	def __on_button_press (self, widget, event):
		'''
		Handler to show the popup menu in the queue
		'''
		if event.type == Gdk.EventType.BUTTON_PRESS and event.button == 3:
			# get the current selection to determine which popup to use
			# (the tree is stored as self.tree; self.queue_tree is never set)
			hit = self.tree.get_path_at_pos (event.x, event.y)

			# nothing to show when the click did not land on a row
			if hit is None:
				return False

			path, column, cell_x, cell_y = hit
			iter = self.store.get_iter (path)

			# show the right context
			# NOTE(review): link_menu_failed / link_menu_active are not
			# created in this class - confirm where they come from before
			# re-enabling this handler
			if iter and self.store[iter][0].is_link:
				link = self.store[iter][0]

				if link.offline:
					self.link_menu_failed.popup (None, None, None, None, event.button, event.time)

				elif link.active:
					self.link_menu_active.popup (None, None, None, None, event.button, event.time)

		return False


	def __on_selection_changed (self, selection):
		'''
		Handler to load the links of the newly selected package
		'''
		model, iter = selection.get_selected()

		# no row is selected (e.g. the selection was cleared)
		if iter is None:
			return

		package = model[iter][0]
		self.links.load (package)
def head_for_server(domain, url):
    """Send a HEAD request to ``domain``/``url`` and report where it ended up.

    Redirects are followed and the request times out after 10 seconds.
    Returns a dict with the final "url" and the response "headers".
    """
    target_url = "/".join((domain, url))
    print(target_url)
    response = requests.head(
        target_url, headers={}, allow_redirects=True, timeout=10)
    return {"url": response.url, "headers": response.headers}

# Poll for unvisited links forever; for each one, issue a HEAD request and
# store the final URL together with the "Server" response header.
while True:
    unvisited_links = Links.get_unvisited_links(conn)

    if len(unvisited_links) == 0:
        print("Nothing to crawl, going to sleep")
        time.sleep(5)
        continue

    for link in unvisited_links:
        print("Going to {}".format(link["url"]))
        try:
            result = head_for_server(link["domain"], link["url"])
            print("Got result for {}. It is {}".format(link["url"], result["url"]))
            Servers.insert_server(conn, link["link_id"], result["url"], result["headers"]["Server"])
        except Exception as error:
            # Stay best-effort, but report the failure. A bare ``except``
            # also swallowed KeyboardInterrupt/SystemExit, which made this
            # endless loop hard to stop.
            print("Failed to crawl {}: {}".format(link["url"], error))
    
예제 #16
0
파일: queue.py 프로젝트: gsterjov/pyLoader
	def __init__ (self, builder, client):
		'''
		Look up the queue widgets in ``builder``, attach a renderer and a
		cell data function to every column, subscribe to the ``client``
		queue/download events and restore the saved column widths.
		'''
		self.client = client

		# load the application settings
		self.settings = Gio.Settings.new ("org.pyLoader.queue")

		self.links = Links (builder, client)
		self.tree = builder.get_object ("queue_tree")
		
		# create the item store (packages), sorted through __store_compare
		self.store = Gtk.ListStore (Package.__gtype__)
		self.store.set_sort_func (0, self.__store_compare, None)
		self.tree.set_model (self.store)
		

		# queue columns
		self.order_column		= builder.get_object ("queue_order")
		self.name_column		= builder.get_object ("queue_name")
		self.links_column		= builder.get_object ("queue_links")
		self.size_column		= builder.get_object ("queue_size")
		self.downloaded_column	= builder.get_object ("queue_downloaded")
		self.speed_column		= builder.get_object ("queue_speed")
		self.eta_column			= builder.get_object ("queue_eta")
		self.progress_column	= builder.get_object ("queue_progress")
		
		# create renderers (text for every column except the progress bar)
		order_renderer		= Gtk.CellRendererText()
		name_renderer		= Gtk.CellRendererText()
		links_renderer		= Gtk.CellRendererText()
		size_renderer		= Gtk.CellRendererText()
		downloaded_renderer	= Gtk.CellRendererText()
		speed_renderer		= Gtk.CellRendererText()
		eta_renderer		= Gtk.CellRendererText()
		progress_renderer	= Gtk.CellRendererProgress()
		
		# set column renderers
		self.order_column.pack_start (order_renderer, True)
		self.name_column.pack_start (name_renderer, True)
		self.links_column.pack_start (links_renderer, True)
		self.size_column.pack_start (size_renderer, True)
		self.downloaded_column.pack_start (downloaded_renderer, True)
		self.speed_column.pack_start (speed_renderer, True)
		self.eta_column.pack_start (eta_renderer, True)
		self.progress_column.pack_start (progress_renderer, True)
		
		# each renderer gets its cell content from the matching __render_* method
		self.order_column.set_cell_data_func (order_renderer, self.__render_order)
		self.name_column.set_cell_data_func (name_renderer, self.__render_name)
		self.links_column.set_cell_data_func (links_renderer, self.__render_links)
		self.size_column.set_cell_data_func (size_renderer, self.__render_size)
		self.downloaded_column.set_cell_data_func (downloaded_renderer, self.__render_downloaded)
		self.speed_column.set_cell_data_func (speed_renderer, self.__render_speed)
		self.eta_column.set_cell_data_func (eta_renderer, self.__render_eta)
		self.progress_column.set_cell_data_func (progress_renderer, self.__render_progress)


		# connect to ui events
		# NOTE(review): the button-press handler is currently disabled
		# self.tree.connect ("button-press-event", self.__on_button_press)

		selection = self.tree.get_selection()
		selection.connect ("changed", self.__on_selection_changed)

		# connect to client property events
		client.queue.added += self.__on_queue_added
		client.queue.changed += self.__on_queue_changed
		client.downloads.changed += self.__on_downloads_changed


		# load the queue column settings
		self.order_column.set_fixed_width (self.settings.get_uint ("column-order-size"))
		self.name_column.set_fixed_width (self.settings.get_uint ("column-name-size"))
		self.links_column.set_fixed_width (self.settings.get_uint ("column-links-size"))
		self.size_column.set_fixed_width (self.settings.get_uint ("column-size-size"))
		self.downloaded_column.set_fixed_width (self.settings.get_uint ("column-downloaded-size"))
		self.speed_column.set_fixed_width (self.settings.get_uint ("column-speed-size"))
		self.eta_column.set_fixed_width (self.settings.get_uint ("column-eta-size"))
		self.progress_column.set_fixed_width (self.settings.get_uint ("column-progress-size"))
예제 #17
0
def main(args):
    """Create the module-level ``links`` object and start the app.

    The app listens on all interfaces, using the port and debug flag taken
    from ``args``.
    """
    global links
    links = Links()

    app.run(
        host='0.0.0.0',
        debug=args.debug,
        port=args.port,
    )
예제 #18
0

def head_for_server(domain, url):
    """Issue a HEAD request for ``url`` on ``domain``.

    The request follows redirects and gives up after 10 seconds. The result
    is a dict holding the final "url" and the response "headers".
    """
    target_url = "/".join((domain, url))
    print(target_url)

    request_options = {
        "headers": {},
        "allow_redirects": True,
        "timeout": 10,
    }
    response = requests.head(target_url, **request_options)

    result = {}
    result["url"] = response.url
    result["headers"] = response.headers
    return result


while True:
    unvisited_links = Links.get_unvisited_links(conn)

    if len(unvisited_links) == 0:
        print("Nothing to crawl, going to sleep")
        time.sleep(5)
        continue

    for link in unvisited_links:
        print("Going to {}".format(link["url"]))
        try:
            result = head_for_server(link["domain"], link["url"])
            print("Got result for {}. It is {}".format(link["url"],
                                                       result["url"]))
            Servers.insert_server(conn, link["link_id"], result["url"],
                                  result["headers"]["Server"])
        except:
예제 #19
0
 def links(self):
     """Return a Links object built from the parsed link header array."""
     return Links(self._link_header_to_array())
예제 #20
0
 def links(self):
     """
     Build the Links of the header from its parsed link array
     """
     return Links(self._link_header_to_array())
예제 #21
0
    for link in soup.find_all("a"):
        href = link.get("href")

        if href is None:
            continue

        if "start.bg" in href and "javascript:" not in href:
            inbound.add(href)
        elif "link.php" in href:
            outbound.add(href)
        else:
            others.add(href)

    return {"inbound": inbound, "outbound": outbound, "others": others}


# Keep crawling until no unvisited domain is left.
while True:
    domains_to_visit = Domains.get_all_unvisited_domains(conn)

    if len(domains_to_visit) == 0:
        break

    for domain_row in domains_to_visit:
        # Record each domain's results right after collecting them. When
        # this was done after the loop, every domain was collected but only
        # the last one was marked visited and had its links stored.
        result = collect_domain(domain_row["domain"])
        domain_id = domain_row["domain_id"]

        Domains.visit_domain(conn, domain_id)
        Domains.insert_domains(conn, result["inbound"])
        Links.insert_links(conn, result["outbound"], domain_id)
예제 #22
0
파일: bot.py 프로젝트: abatilo/cscq_bot
import praw
from linkextractor import LinkExtractor
from history import History
from logger import Logger
from links import Links

# User agent string passed to praw.Reddit.
AGENT = 'Web scraper for /r/cscareerquestions. Made using PRAW'

# Module-level helpers used by the rest of the script.
reddit = praw.Reddit(AGENT)
history = History()
log = Logger()
link_file = Links()

log.write('Getting subreddit...')
cscareerquestions = reddit.get_subreddit('cscareerquestions')

# Only continue when a subreddit object was returned.
if cscareerquestions:
  
  log.write('Getting comments...')
  comments = cscareerquestions.get_comments()
  
  # Fetch the hot submissions, limited to 10.
  log.write('Getting submissions...')
  submissions = cscareerquestions.get_hot(limit = 10)
  
  log.write('Loading history...')
  all_history = history.get_history()
  
  log.write('Looping through recent comments...')
  for comment in comments:
    # NOTE(review): ``id`` shadows the builtin of the same name.
    id = comment.id
    log.write('Handling comment ' + str(id) + '...')
예제 #23
0
#! /usr/bin/python

from links import Links

if __name__ == '__main__':
    # Build the Links object and hand control to its main loop.
    Links().mainloop()
 def setup(self):
     """Create the links, main page and data helpers, then open the landing URL."""
     links = Links()
     self.links = links
     self.main_page = MainPage(self.driver)
     self.data = Data()
     self.driver.get(links.landing)
예제 #25
0
    return {
        "inbound": inbound,
        "outbound": outbound,
        "others": others
    }


# Keep crawling until no unvisited domain is left, pausing between batches.
while True:
    domains_to_visit = Domains.get_all_unvisited_domains(conn)

    if len(domains_to_visit) == 0:
        break

    for domain_row in domains_to_visit:
        # Record each domain's results right after collecting them. When
        # this was done after the loop, every domain was collected but only
        # the last one was marked visited and had its links stored.
        result = collect_domain(domain_row["domain"])
        domain_id = domain_row["domain_id"]

        Domains.visit_domain(conn, domain_id)
        Domains.insert_domains(conn, result["inbound"])
        Links.insert_links(conn, result["outbound"], domain_id)

    print("Sleeping for 20 seconds ZzZzz")
    time.sleep(20)






예제 #26
0
 def __init__(self, driver, timeout=10):
     """Keep the driver, set its implicit wait to ``timeout`` and create the links.

     ``timeout`` defaults to 10 and is passed to ``driver.implicitly_wait``.
     """
     self.driver = driver
     driver.implicitly_wait(timeout)
     self.links = Links()