示例#1
0
def get_ratio_of_discr(df: pd.DataFrame, admissibles, outcome, sensitive):
    """
    Compute the adjusted Ratio of Observational Discrimination.

    One ratio is produced for every contingency matrix yielded by
    ``contingency_matrices_iterator(df, outcome, sensitive, admissibles)``.

    :param df: data to analyse
    :param admissibles: passed through to the contingency matrix iterator
    :param outcome: name of the outcome attribute
    :param sensitive: name of the sensitive attribute
    :return: numpy array of ratios; ``np.array([1.0])`` when either attribute
             does not have exactly two distinct values
    """
    sensitive_values = sorted(get_domain(df, [sensitive]))
    outcome_values = sorted(get_domain(df, [outcome]))

    # The ratio is only computed for binary sensitive/outcome attributes.
    if not (len(sensitive_values) == 2 and len(outcome_values) == 2):
        return np.array([1.0])

    s0, s1 = sensitive_values
    o0, o1 = outcome_values

    ratios = []
    for matrix in contingency_matrices_iterator(df, outcome, sensitive,
                                                admissibles):
        if matrix.shape != (2, 2):
            # Incomplete matrix: recorded as a neutral ratio.
            ratios.append(1.0)
            continue

        numerator = matrix[s0][o1] * matrix[s1][o0]
        denominator = matrix[s0][o0] * matrix[s1][o1]
        # Matrices with a zero denominator contribute no ratio at all.
        if denominator != 0:
            ratios.append(numerator / denominator)

    return np.array(ratios)
示例#2
0
	def get_source_node(self):
		"""Find the node that identifies the source of the document.

		Tried in order: nodes matched by the ``source_selector`` option, any
		node under ``<body>`` accepted by ``has_source``, and finally external
		links whose text looks like a newspaper name.  Returns ``None`` when
		the document has no body or nothing matches.
		"""
		selector = self.options.get('source_selector')
		if selector:
			candidates = self.doc.cssselect(selector)
			# A unique match is trusted without further checks.
			if len(candidates) == 1:
				return candidates[0]
			for candidate in candidates:
				found = self.has_source(candidate)
				if found is not None:
					return found

		body = self.doc.find('body')
		if body is None:
			return None

		for element in body.iter():
			found = self.has_source(element)
			if found is not None:
				return found

		own_domain = get_domain(self.url)
		for anchor in self.doc.iter('a'):
			href = anchor.get('href')
			if not href or not href.startswith('http'):
				continue
			# Only links pointing to another domain are considered.
			if get_domain(href) == own_domain:
				continue
			label = self.get_block_text(anchor)
			# Text ending in u'报' but not in u'举报' is accepted.
			if len(label) > 2 \
					and label.endswith(u'报') \
					and not label.endswith(u'举报'):
				return anchor
示例#3
0
    def get_source_node(self):
        """Find the node that identifies the source of the document.

        Tried in order: nodes matched by the ``source_selector`` option, any
        node under ``<body>`` accepted by ``has_source``, and finally external
        links whose text looks like a newspaper name.

        :return: the matching node, or ``None`` when the document has no
                 ``<body>`` or nothing matches
        """
        selector = self.options.get('source_selector')
        if selector:
            nodes = self.doc.cssselect(selector)
            # A unique match is trusted without further checks.
            if len(nodes) == 1:
                return nodes[0]
            for node in nodes:
                res = self.has_source(node)
                if res is not None:
                    return res

        body = self.doc.find('body')
        if body is None:
            # Without this guard a body-less document raised AttributeError
            # on ``None.iter()``.
            return None

        for node in body.iter():
            res = self.has_source(node)
            if res is not None:
                return res

        domain = get_domain(self.url)
        for a in self.doc.iter('a'):
            link = a.get('href')
            # Only absolute links pointing to another domain are considered.
            if link and link.startswith('http') \
              and get_domain(link) != domain:
                text = self.get_block_text(a)
                # Text ending in u'报' but not in u'举报' is accepted.
                if len(text) > 2 \
                  and text.endswith(u'报') \
                  and not text.endswith(u'举报'):
                    return a
        return None
示例#4
0
	def get_source_node(self):
		"""Find the node that identifies the source of the article.

		Tried in order: nodes matched by the ``source_selector`` option, any
		node under ``<body>`` accepted by ``has_source``, and finally external
		links whose text looks like a publication name.

		:return: the matching node, or ``None`` when the document has no
		         ``<body>`` or nothing matches
		"""
		if self.options.get('source_selector', ''):
			nodes = self.article.select(self.options['source_selector'])
			# A unique match is trusted without further checks.
			if len(nodes) == 1:
				return nodes[0]
			for node in nodes:
				res = self.has_source(node)
				if res is not None:
					return res

		body = self.article.doc.find('body')
		if body is None:
			# Without this guard a body-less document raised AttributeError
			# on ``None.iter()``.
			return None

		for node in body.iter():
			res = self.has_source(node)
			if res is not None:
				return res

		domain = get_domain(self.article.url)
		for a in self.article.doc.iter('a'):
			link = a.get('href')
			# Only absolute links pointing to another domain are considered.
			if link and link.startswith('http') \
					and get_domain(link) != domain:
				text = self.article.get_block_text(a)
				if len(text) >= 2:
					# Either the text ends in u'报' (but not u'举报'), or it is
					# exactly four characters long and ends with a CATES entry.
					if text.endswith(u'报') and not text.endswith(u'举报') \
							or text[-2:] in CATES and len(text) == 4:
						return a
		return None
示例#5
0
def learn_page(request, theme_name=None, template='theme_page.html'):
    """Render the learn page of the theme called ``theme_name``.

    Responds with a 404 when no such theme exists.  The theme's layers,
    ordered by name, are attached to it as ``theme.layers`` for the template.
    """
    theme = get_object_or_404(Theme, name=theme_name)
    theme.layers = theme.layer_set.all().order_by('name')
    context = dict(
        theme=theme,
        domain=get_domain(8000),
        domain8010=get_domain(),
    )
    request_context = RequestContext(request, context)
    return render_to_response(template, request_context)
示例#6
0
def data_needs(request, template='needs.html'):
    """Render the data-needs page listing every theme by display name."""
    themes = Theme.objects.all().order_by('display_name')
    ordered_themes, theme_dict = add_ordered_needs_lists(themes)
    context = dict(
        themes=themes,
        theme_dict=theme_dict,
        ordered_themes=ordered_themes,
        domain=get_domain(8000),
        domain8010=get_domain(),
    )
    request_context = RequestContext(request, context)
    return render_to_response(template, request_context)
示例#7
0
def data_needs(request, template="needs.html"):
    """Render the data-needs page listing every theme by display name."""
    all_themes = Theme.objects.all().order_by("display_name")
    ordered_themes, theme_dict = add_ordered_needs_lists(all_themes)
    context = dict(
        themes=all_themes,
        theme_dict=theme_dict,
        ordered_themes=ordered_themes,
        domain=get_domain(8000),
        domain8010=get_domain(),
    )
    return render_to_response(template, RequestContext(request, context))
示例#8
0
def csw_listing(request, template='pycsw_catalog_view.html'):
  """Render the catalog view with all pycsw records ordered by organization.

  Each record receives a sequential ``html_id`` (starting at 0) for use by
  the template.
  """
  if logger:
    logger.info("Start csw_listing")
  csw_recs = pycsw_records.objects.using('pycsw_test').all().order_by('organization')
  for position, record in enumerate(csw_recs):
    record.html_id = position
  context = {
    'records': csw_recs,
    'domain': get_domain(8000),
    'domain8010': get_domain(),
  }
  if logger:
    logger.info("End csw_listing")
  return render_to_response(template, RequestContext(request, context))
示例#9
0
def topic_page(request, topic_name=None, template='topic_page.html'):
    """Render the page of the topic called ``topic_name``.

    Responds with a 404 when no such topic exists.  The context carries the
    topic, its map views (ordered by ``ordering``), a JSON list of the view
    names, the name of the first view as ``initial_view`` and the topic's
    layers ordered by name.

    ``initial_view`` is ``None`` when the topic has no map views; previously
    that case raised ``IndexError``.
    """
    topic = get_object_or_404(Topic, name=topic_name)
    views = MapView.objects.filter(topic=topic).order_by('ordering')
    view_names = [view.name for view in views]
    layers = topic.layers.all().order_by('name')
    context = {
        'topic': topic,
        'views': views,
        'views_list': simplejson.dumps(view_names),
        'initial_view': view_names[0] if view_names else None,
        'layers': layers,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    return render_to_response(template, RequestContext(request, context))
示例#10
0
    def get_har(
            self,
            remove_domain_request=True,
            domains_to_remove={
                'facebook.com', 'facebook.it', 'youtube.it', 'youtube.com',
                'twitter.it', 'twitter.com'
            },
            file_type_to_remove={'jpg', 'png', 'jpeg'}):
        """Return the distinct URLs seen in ``Network.responseReceived`` logs.

        :param remove_domain_request: when true, URLs belonging to the domain
            of ``self.current_url`` are left out
        :param domains_to_remove: domains whose URLs are always left out
        :param file_type_to_remove: file types whose URLs are left out
        :return: list of unique URLs (order not guaranteed); empty when
            logging is off or there are no logs
        """
        if not (self.logging and self.logs):
            return []

        own_domain = None
        if remove_domain_request:
            own_domain = utils.get_domain(self.current_url)

        collected = []
        for entry in self.logs:
            message = json.load(StringIO(entry['message']))['message']
            if 'method' not in message:
                continue
            if message['method'] != 'Network.responseReceived':
                continue

            url = message['params']['response']['url']
            if not utils.is_valid_url(url):
                continue

            # Keep the URL when no own domain is set, or when it is external.
            keep = (own_domain and not utils.is_domain_link(
                url, own_domain)) or own_domain is None
            keep = keep and utils.get_filetype_from_url(
                url) not in file_type_to_remove
            if not keep:
                continue

            if any(utils.is_domain_link(url, blocked)
                   for blocked in domains_to_remove):
                continue
            collected.append(url)

        return list(set(collected))
示例#11
0
 def __init__(self, link, base_url):
     """Capture the properties of ``link`` relative to ``base_url``.

     :param link: link object; must expose a ``parent`` attribute and be
                  accepted by ``get_text``, ``get_class`` and ``get_href``
     :param base_url: URL passed to ``get_href`` and stored on the instance
     """
     self.text = self.get_text(link)
     self.class_ = self.get_class(link)
     self.href = self.get_href(link, base_url)
     # The domain is derived from the href computed just above.
     self.domain = get_domain(self.href)
     self.parent = link.parent
     self.base_url = base_url
示例#12
0
	def add(self, cate):
		"""Register ``cate`` in the URL tree rooted at ``self.root``.

		The tree is walked (and created where missing) by domain, then by
		subdomain, then by each path segment and finally by the query string;
		``cate`` is stored under the ``'cate'`` key of the node reached.
		"""
		url = cate['url']

		domain = get_domain(url)
		subdomains = get_subdomains(url)
		segments = get_path(url).split('/')
		query = urlparse.urlparse(url).query

		node = self.root.setdefault(domain, {'sub':{}, 'path':{}})

		# A lone 'www' subdomain is treated like no subdomain at all.
		only_www = len(subdomains) == 1 and subdomains[0] == 'www'
		if subdomains and not only_www:
			for sub in subdomains:
				node = node['sub'].setdefault(sub, {'sub':{}, 'path':{}})

		for segment in segments:
			node = node['path'].setdefault(segment, {'path':{}})

		if query:
			# Query nodes are always replaced, never reused.
			query_key = 'query___' + query
			node['path'][query_key] = {'path':{}}
			node = node['path'][query_key]

		node['cate'] = cate
示例#13
0
 def bookmark_link(self):
     """Return the bookmark URL for this layer.

     An explicit ``bookmark`` wins.  Otherwise a sublayer reuses its parent's
     bookmark with ``<layer_id>`` replaced by its own id, and anything else
     falls back to a planner URL built from the slug.
     """
     if self.bookmark:
         return self.bookmark
     if self.is_sublayer and self.parent.bookmark:
         return self.parent.bookmark.replace('<layer_id>', str(self.id))
     return '%s/planner/#%s' % (get_domain(8000), self.slug)
示例#14
0
 def bookmark_link(self):
     """Return the bookmark URL for this layer.

     Precedence: the layer's own ``bookmark``; then, for a sublayer, the
     parent's bookmark with ``<layer_id>`` replaced by this layer's id; then
     a planner URL built from the slug.
     """
     own_bookmark = self.bookmark
     if own_bookmark:
         return own_bookmark

     if self.is_sublayer and self.parent.bookmark:
         layer_id = str(self.id)
         return self.parent.bookmark.replace('<layer_id>', layer_id)

     domain = get_domain(8000)
     return '%s/planner/#%s' % (domain, self.slug)
示例#15
0
def deploy_prometheus_route():
    '''Deploy Prometheus Route

    Copies the route template into the build directory with the
    ``INGRESS_DOMAIN`` placeholder replaced by the ingress domain, then
    applies the generated file.

    The domain comes from ``utils.get_domain()``; when that fails it is
    recovered from the host of the ``assisted-installer`` ingress.  The
    process exits with status 1 when that host cannot be used.
    '''
    topic = 'Prometheus Operator Route'
    src_file = os.path.join(
        os.getcwd(),
        "deploy/monitoring/prometheus/assisted-installer-ocp-prometheus-route.yaml")
    dst_file = os.path.join(
        os.getcwd(),
        "build/assisted-installer-ocp-prometheus-route.yaml")
    try:
        # I have permissions
        ingress_domain = utils.get_domain()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # ``sys.exit`` are not mistaken for a permission problem.
        # I have not permissions, yes it's ugly...
        # This ingress should be there because of UI deployment
        json_path_ingress = '{.spec.rules[0].host}'
        cmd = "{} get ingress assisted-installer -o jsonpath='{}'".format(
            CMD_BIN, json_path_ingress)
        assisted_installer_ingress_domain = utils.check_output(cmd)
        host, _, remainder = assisted_installer_ingress_domain.partition(".")
        # An empty remainder means there is no domain after the host name;
        # this used to surface as an IndexError.
        if host != 'assisted-installer' or not remainder:
            print("Error recovering the ingress route")
            sys.exit(1)

        ingress_domain = remainder
    with open(src_file, "r") as src:
        with open(dst_file, "w+") as dst:
            data = src.read()
            data = data.replace("INGRESS_DOMAIN", ingress_domain)
            print("Deploying {}: {}".format(topic, dst_file))
            dst.write(data)
    utils.apply(dst_file)
示例#16
0
def top_things(db_file):
    """Aggregate per-domain and per-person statistics from the database.

    Every record is visited from the last one backwards.  For each record
    the value returned by ``get_domain`` and the record's ``'person'`` field
    are counted, and the two are linked to each other in a graph.

    :param db_file: path of the database file to read
    :return: tuple ``(urls, people, graph)`` where ``urls`` and ``people``
             are lists of ``(key, count)`` pairs sorted by descending count
             and ``graph`` maps each domain/person to a node dict whose
             ``"data"`` list holds the distinct items it is linked to
    """
    urls = {}
    people = {}
    graph = {}

    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        # Parenthesised so the statement is valid on Python 2 and 3 alike.
        print("Could not open database. (Top things)")

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        split = get_domain(loaded_rec)
        urls[split] = urls.get(split, 0) + 1

        person = loaded_rec['person']
        people[person] = people.get(person, 0) + 1

        # Compare by value: ``is not ""`` tested object identity, which is
        # not a reliable emptiness check.
        if split not in (None, "") and person not in (None, ""):
            # Build a crazy relational graph out of my nosql data
            if split not in graph:
                graph[split] = {
                    "is_person": False,
                    "data": [person],
                    "linked_to_count": 1
                }
            # Membership is checked against the "data" list; the previous
            # check looked at the node dict's keys, so the same person was
            # appended (and counted) once per record.
            elif person not in graph[split]["data"]:
                graph[split]["data"].append(person)
                graph[split]["linked_to_count"] += 1

            if person not in graph:
                graph[person] = {"is_person": True, "data": [split]}
            elif split not in graph[person]["data"]:
                graph[person]["data"].append(split)

        cur.step_back()
    cur.disable()
    db.close()

    def get_one(x):
        return x[1]

    return (sorted(urls.items(), key=get_one, reverse=True),
            sorted(people.items(), key=get_one, reverse=True), graph)
示例#17
0
def main():
    """Render the YAML template at ``SRC_FILE`` into ``DST_FILE`` and apply it.

    The service host/port, base DNS domains and namespace placeholders are
    substituted in the template text.  The image references, ``SELF_VERSION``
    and (when available) ``RELEASE_TAG`` are then merged into the document's
    top-level ``data`` mapping before it is written and applied.
    """
    deploy_options = handle_arguments()
    # TODO: delete once rename everything to assisted-installer
    if deploy_options.target == "oc-ingress":
        service_host = "assisted-installer.{}".format(
            utils.get_domain(deploy_options.domain))
        service_port = "80"
    else:
        service_host = utils.get_service_host(
            SERVICE, deploy_options.target, namespace=deploy_options.namespace)
        service_port = utils.get_service_port(
            SERVICE, deploy_options.target, namespace=deploy_options.namespace)

    with open(SRC_FILE, "r") as src:
        with open(DST_FILE, "w+") as dst:
            data = src.read()
            data = data.replace("REPLACE_URL", '"{}"'.format(service_host))
            data = data.replace("REPLACE_PORT", '"{}"'.format(service_port))
            data = data.replace("REPLACE_DOMAINS",
                                '"{}"'.format(deploy_options.base_dns_domains))
            data = data.replace('REPLACE_NAMESPACE', deploy_options.namespace)
            print("Deploying {}".format(DST_FILE))

            # Environment variable name -> short image name.
            image_short_names = {
                "IMAGE_BUILDER": "installer-image-build",
                "AGENT_DOCKER_IMAGE": "agent",
                "KUBECONFIG_GENERATE_IMAGE":
                "ignition-manifests-and-kubeconfig-generate",
                "INSTALLER_IMAGE": "assisted-installer",
                "CONTROLLER_IMAGE": "assisted-installer-controller",
                "CONNECTIVITY_CHECK_IMAGE": "connectivity_check",
                "INVENTORY_IMAGE": "inventory"
            }
            # Built as a new dict instead of rewriting values of the dict
            # being iterated.
            versions = {
                env_var_name: deployment_options.get_image_override(
                    deploy_options, image_short_name, env_var_name)
                for env_var_name, image_short_name in image_short_names.items()
            }

            # Edge case for controller image override
            if os.environ.get("INSTALLER_IMAGE") \
                    and not os.environ.get("CONTROLLER_IMAGE"):
                versions["CONTROLLER_IMAGE"] = \
                    deployment_options.IMAGE_FQDN_TEMPLATE.format(
                        "assisted-installer-controller",
                        deployment_options.get_tag(
                            versions["INSTALLER_IMAGE"]))

            versions["SELF_VERSION"] = deployment_options.get_image_override(
                deploy_options, "assisted-service", "SERVICE")
            deploy_tag = get_deployment_tag(deploy_options)
            if deploy_tag:
                versions["RELEASE_TAG"] = deploy_tag

            # safe_load: ``yaml.load`` without a Loader can construct
            # arbitrary objects and is rejected by recent PyYAML releases.
            y = yaml.safe_load(data)
            y['data'].update(versions)
            data = yaml.dump(y)
            dst.write(data)

    utils.apply(DST_FILE)
 def get_allowed_from(self, child_urls):
     """
     Keep only the child urls that robots.txt allows this agent to crawl.

     When the rules of a domain cannot be fetched or evaluated, every url
     of that domain is allowed.

     :param child_urls: List of child urls to check robots.txt on
     :return: A list of allowed child urls to crawl
     """
     # Group the urls by domain in one eager pass.  The previous
     # ``filter(lambda ...)`` inside a dict comprehension is lazy on
     # Python 3 and its lambda sees only the last ``domain`` value, so every
     # domain ended up with the last domain's urls.
     domain_to_children = {}
     for url in child_urls:
         domain = '{0}'.format(get_domain(url))
         domain_to_children.setdefault(domain, []).append(url)

     allowed = []
     for domain, children in domain_to_children.items():
         try:
             rules = self.robots.fetch(domain)
             permitted = [url for url in children
                          if rules.allowed(url, self._agent)]
         except Exception:
             # Best effort: a failure for this domain allows all its urls.
             # Assigning (rather than extending ``allowed`` in place) avoids
             # duplicates when the failure happens midway.
             permitted = children
         allowed.extend(permitted)
     return allowed
示例#19
0
def introspect(domain):
    """Render the index page restricted to items matching ``domain``.

    An item matches when the lower-cased result of ``get_domain`` on its
    decoded payload is contained in the lower-cased ``domain``.
    """
    def filter_func(x):
        return get_domain(loads(x[1])).lower() in domain.lower()

    page_arg = request.args.get("page", 0)
    pages, requested_page = get_effective_page(page_arg, filter_func)
    items = get_items(filter_func, g.db_file, requested_page)

    return render_template(
        "index.html",
        items=items,
        pages=pages,
        requested_page=requested_page,
        current_page=page_arg,
    )
示例#20
0
def is_image_link(url):
	"""Tell whether ``url`` points to an image.

	True when the text after the last dot is in ``img_extensions``, or when
	one of the dot-separated parts of the url's domain is in ``img_sharers``.
	"""
	extension = url.rsplit('.', 1)[-1]
	if extension in img_extensions:
		return True
	domain_parts = get_domain(url).split('.')
	return any(sharer in domain_parts for sharer in img_sharers)
示例#21
0
def add_learn_links(themes):
    """Pair every theme with the URL of its learn page.

    :return: list of ``{'theme': theme, 'learn_link': url}`` dicts, in the
             order of ``themes``
    """
    domain = get_domain()
    return [
        {
            'theme': theme,
            'learn_link': '%s/portal/learn/%s' % (domain, linkify(theme.name)),
        }
        for theme in themes
    ]
示例#22
0
 def get_error_rate(self, response):
     """Record statistics for ``response`` and follow every link on the page.

     Adds the request's domain to ``self.out_domains``, bumps the crawler
     stats, accumulates the download time and yields one ``Request`` per
     usable ``href`` found in the page.
     """
     request_url = response.request.url
     self.out_domains.add(get_domain(request_url))

     stats = self.crawler.stats
     stats.inc_value("no_requests")
     if not self.domain.check_request_url(request_url):
         stats.inc_value('no_new_posts')

     self.sum_download_time += response.meta['request_time']

     hrefs = response.xpath("//a/@href").getall()
     targets = [response.urljoin(href.strip()) for href in hrefs if fix_url(href)]
     for target in targets:
         yield Request(url=target,
                       callback=self.get_error_rate,
                       errback=self.check_error_back_rate)
示例#23
0
    def __init__(self, name, url):
        """Set up the site description and collect its links.

        :param name: name stored on the instance
        :param url: site URL; spaces are removed before it is stored
        """
        self.name = name
        self.url = url.replace(' ','')  
        # The helper calls below run at construction time, in this order.
        self.html = self.get_html()
        self.domain = get_domain(self.url) # e.g. stackoverflow.com
        self.base_url = self.get_base_url() # e.g. http://stackoverflow.com/
        self.all_links = self.get_all_links()
        self.some_valid_links = self.get_some_valid_links()

        self.valid_links = self.get_all_valid_links()
示例#24
0
def top_things(db_file):
    """Aggregate per-domain and per-person statistics from the database.

    Every record is visited from the last one backwards.  For each record
    the value returned by ``get_domain`` and the record's ``'person'`` field
    are counted, and the two are linked to each other in a graph.

    :param db_file: path of the database file to read
    :return: tuple ``(urls, people, graph)`` where ``urls`` and ``people``
             are lists of ``(key, count)`` pairs sorted by descending count
             and ``graph`` maps each domain/person to a node dict whose
             ``"data"`` list holds the distinct items it is linked to
    """
    urls = {}
    people = {}
    graph = {}

    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        # Parenthesised so the statement is valid on Python 2 and 3 alike.
        print("Could not open database. (Top things)")

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        split = get_domain(loaded_rec)
        urls[split] = urls.get(split, 0) + 1

        person = loaded_rec['person']
        people[person] = people.get(person, 0) + 1

        # Compare by value: ``is not ""`` tested object identity, which is
        # not a reliable emptiness check.
        if split not in (None, "") and person not in (None, ""):
            # Build a crazy relational graph out of my nosql data
            if split not in graph:
                graph[split] = {"is_person": False, "data": [person], "linked_to_count": 1}
            # Membership is checked against the "data" list; the previous
            # check looked at the node dict's keys, so the same person was
            # appended (and counted) once per record.
            elif person not in graph[split]["data"]:
                graph[split]["data"].append(person)
                graph[split]["linked_to_count"] += 1

            if person not in graph:
                graph[person] = {"is_person": True, "data": [split]}
            elif split not in graph[person]["data"]:
                graph[person]["data"].append(split)

        cur.step_back()
    cur.disable()
    db.close()

    def get_one(x):
        return x[1]

    return (sorted(urls.items(), key=get_one, reverse=True),
            sorted(people.items(), key=get_one, reverse=True),
            graph)
示例#25
0
    def __init__(self,
                 url_list,
                 save_base_dir,
                 header,
                 encoding=None,
                 grab_out_site_link=False,
                 to_single_page=False,
                 full_site=False,
                 ref_model=False,
                 framework=None):
        """
        Prepare the download state and start the download thread.

        Note that the worker thread is started from within the constructor.

        :param url_list: URLs queued as html pages to fetch; the first one
                         determines ``self.domain``
        :param save_base_dir: parent directory under which the archive and
                              temp directories are built
        :param header: stored as ``self.header``
        :param encoding: stored as ``self.charset``
        :param grab_out_site_link: whether to grab links outside the site;
                                   forced to False when ``ref_model`` is set
        :param to_single_page: whether to pack images, css, js, etc. into a
                               single page
        :param full_site: whether the whole site is grabbed
        :param ref_model: enables reference (hotlink) mode
        :param framework: stored as ``self.framework_support``
        """
        self.parent_save_dir = save_base_dir
        self.date_str = get_date()
        self.zip_save_base_abs_dir = f"{self.parent_save_dir}/{config.template_archive_dir}/{self.date_str}/"  # zip /xx/xx/archive/2019-00-01/
        self.download_temp_abs_dir = f"{self.parent_save_dir}/{config.template_temp_dir}/{self.date_str}/"
        self.domain = get_domain(url_list[0])
        self.tpl_dl_dir, self.js_dir, self.img_dir, self.css_dir, self.other_dir = self.__prepare_dirs(
        )
        self.dl_urls = {}  # used for de-duplication; maps url => absolute path on disk
        self.error_grab_resource = {}  # records http url => disk url; a report is generated and packaged at the end
        self.header = header
        self.charset = encoding
        self.is_grab_outer_link = grab_out_site_link
        self.is_ref_model = ref_model
        if self.is_ref_model:
            self.is_grab_outer_link = False  # in hotlink mode external resources are never fetched, and internal resources are rewritten to absolute paths as well
        self.is_to_single_page = to_single_page  # whether to pack images, css, js, etc. into a single page
        self.single_page = []  # used when generating the report
        self.is_full_site = full_site  # whether the whole site is grabbed
        self.html_link_queue = Queue()  # queue of html pages
        for u in url_list:
            self.html_link_queue.put(u)
        self.downloaded_html_url = [
        ]  # html pages already downloaded, stored as (disk_path, file_name, url, ); used at the end to fix up the links
        self.download_queue = Queue(
        )  # data format: json  {'cmd':quit/download, "url":'http://baidu.com', "save_path":'/full/path/file.ext', 'type':'bin/text'}
        self.download_finished = False  # the urls being used up does not mean every network request has returned
        self.task_finished = False  # all network requests have returned and the event loop has ended
        self.zip_result_file = None
        self.file_name_dup_checker = {
        }  # file_name => url; used to check whether a generated file name is duplicated, in which case it must be regenerated
        self.framework_support = framework
        self.thread = threading.Thread(target=self.__download_thread)
        self.thread.start()
示例#26
0
 def __init__(self, input, **options):
     """Parse ``input`` and keep the extraction options.

     :param input: markup handed to ``clean_html``
     :param options: optional settings; ``url``, ``debug``, ``title``,
                     ``pages`` and ``texts`` are read here and the whole
                     dict is kept as ``self.options``
     """
     self.input = input
     self.url = options.get('url', '')
     self.debug = options.get('debug', False)
     self.title = options.get('title', '^^')
     self.pages = options.get('pages', None)
     self.texts = options.get('texts', None)
     self.domain = get_domain(self.url)
     self.options = options
     # Cleaned document tree and its text content.
     self.doc = clean_html(input, return_doc=True)
     self.text = self.doc.text_content()
     # Word count of the text, or 0 when there is no text.
     self.len = word_count(self.text) if self.text else 0
示例#27
0
	def __init__(self, input, **options):
		"""Parse ``input`` and keep the extraction options.

		:param input: markup handed to ``clean_html``
		:param options: optional settings; ``url``, ``debug``, ``title``,
		                ``pages`` and ``texts`` are read here and the whole
		                dict is kept as ``self.options``
		"""
		self.input = input
		self.url = options.get('url', '')
		self.debug = options.get('debug', False)
		self.title = options.get('title', '^^')
		self.pages = options.get('pages', None)
		self.texts = options.get('texts', None)
		self.domain = get_domain(self.url)
		self.options = options
		# Cleaned document tree and its text content.
		self.doc = clean_html(input, return_doc=True)
		self.text = self.doc.text_content()
		# Word count of the text, or 0 when there is no text.
		self.len = word_count(self.text) if self.text else 0
示例#28
0
    def get_cookie_syncs_for_multiple_sites(self,
                                            sites,
                                            cookie_length=8,
                                            filepath=''):
        """Collect cookie syncing data for several sites and write it to disk.

        Two CSV files are written into ``filepath``:
        ``full_cookie_syncs.csv`` with one row per observed sync, and
        ``condensed_cookie_syncs.csv`` with one row per sender and receiving
        domain.

        Cookies must be at least cookie_length characters long to be considered.
        """
        sites = self._filter_site_list(sites)
        cookie_sync_data = defaultdict(defaultdict)
        for site in sites:
            cookie_sync_data[site] = self.get_cookie_syncs_by_site(
                site, cookie_length=cookie_length)

        # Complete output: every (sender, receiving url, cookie value) row.
        full_path = os.path.join(filepath, 'full_cookie_syncs.csv')
        with open(full_path, 'w') as full_file:
            full_writer = csv.writer(full_file)
            full_writer.writerow(
                ['site', 'sending_domain', 'receiving_url', 'cookie_value'])
            for site in cookie_sync_data:
                site_syncs = cookie_sync_data[site]
                for receiving_url in site_syncs:
                    for sending_url, cookie_value in site_syncs[receiving_url]:
                        full_writer.writerow(
                            [site, sending_url, receiving_url, cookie_value])

        # Condensed output: senders grouped by the receiving url's domain
        # instead of the full receiving url.
        cooks_just_domains = defaultdict(defaultdict)
        for site in cookie_sync_data:
            senders_by_domain = defaultdict(set)
            cooks_just_domains[site] = senders_by_domain
            site_syncs = cookie_sync_data[site]
            for receiving_url in site_syncs:
                for sending_domain, _ in site_syncs[receiving_url]:
                    senders_by_domain[utils.get_domain(receiving_url)].add(
                        sending_domain)

        condensed_path = os.path.join(filepath, 'condensed_cookie_syncs.csv')
        with open(condensed_path, 'w') as condensed_file:
            condensed_writer = csv.writer(condensed_file)
            condensed_writer.writerow(
                ['site', 'sending_domain', 'receiving_domain'])
            for site in cooks_just_domains:
                for receiving_domain in cooks_just_domains[site]:
                    senders = cooks_just_domains[site][receiving_domain]
                    # 'NOT_FOUND' is dropped once a real sender is known.
                    if len(senders) > 1 and 'NOT_FOUND' in senders:
                        senders.discard('NOT_FOUND')
                    for sending_domain in senders:
                        condensed_writer.writerow(
                            [site, sending_domain, receiving_domain])
示例#29
0
    def __init__(self, function):
        """Analyze ``function`` and store its domain, roots and sign.

        :param function: the function to analyze; handed to ``get_domain``,
                         ``get_roots`` and ``get_sign``
        """
        super(LowAnalyzer, self).__init__()

        self.function = function

        # Domain
        self.domain = get_domain(self.function)

        # Roots
        self.roots = get_roots(self.function)

        # Sign
        self.negative, self.positive = get_sign(self.function)
示例#30
0
    def render_template(self, template_path, template_context,
                        to_string=False):
        """
        Render a template, either to the response or to a string.

        ``template_context`` is extended in place with the client settings
        (also serialized as JSON under ``'settings'``) and the event
        resources before rendering.  The rendered text is returned when
        ``to_string`` is true; otherwise it is written to ``self.response``.
        """
        from modules.events.internal import api as events_api

        # Debug - Show what non-js search engines see
        no_client = self.request.get('no_client', False)
        template_context['no_client'] = bool(no_client)

        # TODO: This needs to abstract the jinja env out further...
        from main import JINJA_ENVIRONMENT

        settings = template_context['settings_dict'] = {}
        settings['is_appspot'] = is_appspot()
        settings['domain'] = get_domain()

        # Tack on the google analytics profiles
        # TODO: Remove this before being prod ready
        settings['ga_profile_id'] = (
            'UA-54271335-1' if is_appspot() else 'UA-54271335-2')

        # Temporary Serverside rendering handler - this should be done via jina extensions likely
        upcoming = events_api.get_upcoming_event_resources()
        template_context['upcoming_event_resources'] = upcoming
        ongoing = events_api.get_ongoing_event_resources()
        template_context['ongoing_event_resources'] = ongoing

        # TODO: This should come from some sort of middleware likely
        settings['is_authenticated'] = bool(users.get_current_user())

        template_context['settings'] = json.dumps(settings)

        template = JINJA_ENVIRONMENT.get_template(template_path)
        rendered_content = template.render(template_context)

        if not to_string:
            self.response.write(rendered_content)
            return None
        return rendered_content
示例#31
0
def tiles_page(request, slug=None, template='tiles_page.html'):
    """Render the tiles page of the layer whose ``slug_name`` is ``slug``.

    Responds with a 404 when no such layer exists.  Two variants of the
    layer URL are passed to the template: one with the ``{z}/{x}/{y}``
    placeholders renamed to ``{level}/{col}/{row}``, and one with
    ``/export`` removed.
    """
    layer = get_object_or_404(Layer, slug_name=slug)
    orig_url = layer.url

    arctile_url = orig_url
    for old, new in (('{z}', '{level}'), ('{x}', '{col}'), ('{y}', '{row}')):
        arctile_url = arctile_url.replace(old, new)
    arcrest_url = orig_url.replace('/export', '')

    context = dict(
        layer=layer,
        arctile_url=arctile_url,
        arcrest_url=arcrest_url,
        domain=get_domain(8000),
    )
    return render_to_response(template, RequestContext(request, context))
 def fetch_from(self, urls):
     """
     Collect the page urls listed in the sitemaps of the given urls' domains.

     Locations ending in ``.xml`` are left out.

     :param urls: A list of urls to fetch sitemaps of
     :return: A list of urls that was found within each sitemap of given urls
     """
     unique_domains = list({get_domain(u) for u in urls})
     sitemaps = self._try_fetch_sitemaps(unique_domains)

     found = []
     for key in sitemaps:
         for content in self.requests_getter.get_content_from(sitemaps[key]):
             locations = self.sitemap_url_extractor.extract_from(content)
             found.extend(
                 [loc for loc in locations if not loc.endswith('.xml')])
     return found
示例#33
0
def article():
    """Show the extracted article for the ``url`` query parameter.

    A stored article only gets its view counter incremented.  Otherwise the
    page is fetched, converted to an article, given up to 10 related urls
    that share the page's template, and saved.  Extraction is best effort:
    a failure is logged and the page is rendered with whatever is available.
    """
    url = request.args.get('url')

    article = mongo.article.find_one({'_id': url})

    if not article:
        try:
            html = get_or_cache(url)
            article = html2article(html, url, selector=True, merge=True)
            if article and not article['src_name']:
                article['src_name'] = get_domain(url)

            tpl = url2tpl(url)
            urls = html2urls(html, url)
            # Keep the longest text seen for every linked url.
            texts = dict(
                (link, max(candidates, key=len))
                for link, candidates in urls.iteritems())
            tmp = dict((link, url2tpl(link)) for link in texts.iterkeys())

            # Related urls: other pages sharing this page's template.
            urls = {}
            for u, t in tmp.iteritems():
                if u != url and t == tpl:
                    urls[u] = texts[u]
                    if len(urls) >= 10:
                        break

            if article:
                article['urls'] = urls
                article['_id'] = url
                article['view'] = 1
                article['last'] = time.time()

                # The stored copy keeps the related urls as a JSON string.
                copy = article.copy()
                copy['urls'] = json.dumps(copy['urls'])
                mongo.article.save(copy)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so interpreter
            # exits are not swallowed; the failure is logged, not hidden.
            import logging
            logging.getLogger(__name__).exception(
                'Failed to extract article from %s', url)
    else:
        article['urls'] = json.loads(article['urls'])
        mongo.article.update({'_id': url},
                             {'$set': {
                                 'view': article['view'] + 1
                             }})

    if article:
        article['pubtime'] = article['pubtime'][:10]

    return render_template('extract/article.html', article=article, url=url)
示例#34
0
def deploy_grafana_route():
    """Deploy the Grafana route.

    Copies the route template into the build directory with the
    ``INGRESS_DOMAIN`` placeholder replaced by ``utils.get_domain()``, then
    applies the generated file.
    """
    topic = 'Grafana Route'
    working_dir = os.getcwd()
    src_file = os.path.join(
        working_dir,
        "deploy/monitoring/grafana/assisted-installer-ocp-grafana-route.yaml")
    dst_file = os.path.join(
        os.getcwd(), "build/assisted-installer-ocp-grafana-route.yaml")
    ingress_domain = utils.get_domain()
    with open(src_file, "r") as src, open(dst_file, "w+") as dst:
        rendered = src.read().replace("INGRESS_DOMAIN", ingress_domain)
        print("Deploying {}: {}".format(topic, dst_file))
        dst.write(rendered)
    utils.apply(dst_file)
def sortFrontier(frontier,  domainurl_count):
    """Score every frontier object and return them best first.

    The score, stored on each object as ``score``, is the sum of its
    normalized in-link count, the normalized count of its domain in
    ``domainurl_count`` and its keyword score.
    """
    inlinks_score = normalize([len(page.inlinks) for page in frontier])
    domain_score = normalize(
        [domainurl_count[get_domain(page.url)] for page in frontier])
    keyword_score = match_keywords(frontier)

    for position, page in enumerate(frontier):
        page.score = (inlinks_score[position]
                      + domain_score[position]
                      + keyword_score[position])

    ranked = sorted(frontier, key=lambda page: page.score, reverse=True)

    top_entries = [(page.url, page.score) for page in ranked[:30]]
    logging.info("Returning sorted list of objects (top 30){}".format(top_entries))
    return ranked
示例#36
0
def domain_out_domains():
    """Write one JSON line per domain listing the domains it links out to.

    Every file in the ``url_outlinks`` directory is parsed as JSON shaped
    like ``{domain: {url: [outlink, ...]}}``. For each domain, the distinct
    ``get_domain(outlink)`` values over all of its URLs are collected and
    written to ``domain_out_domains.jsonl`` as
    ``{"domain": ..., "out_domains": [...]}``. The order inside
    ``out_domains`` is the iteration order of a set and is therefore not
    stable between runs.
    """
    with open('domain_out_domains.jsonl', mode='w') as f:
        for file_name in os.listdir('url_outlinks'):
            # The input handle is only needed for parsing; the ``with``
            # statements close both files, so no explicit close() calls.
            with open(f"url_outlinks/{file_name}") as fp:
                domains = json.load(fp)

            for domain, url_outlinks in domains.items():
                out_domains_set = {
                    get_domain(outlink)
                    for outlinks in url_outlinks.values()
                    for outlink in outlinks
                }
                f.write(
                    f"{json.dumps({'domain': domain, 'out_domains': list(out_domains_set)})}\n"
                )

            # Push each input file's results to disk before starting the next.
            f.flush()
示例#37
0
 def fetch_stories(self, correlation_id=-1):
     """Fetch stories from the twitter sensor and record them on this agent.

     Requests ``/twitter_sensor/`` on the host returned by ``get_domain()``
     with the user's name and password as query parameters and parses the
     body as JSON. Every key of the parsed object is added as a read story
     whose only author is the value mapped to that key, and that value is
     also added as a user. Failures are reported through ``log_event``
     instead of being raised.

     :param correlation_id: identifier forwarded to ``log_event``.
     """
     try:
         # NOTE(review): the credentials are interpolated into the query
         # string without URL-escaping -- confirm they can never contain
         # reserved characters.
         url = "http://%s/twitter_sensor/?user=%s&password=%s" % (get_domain(), self.user.user_name, self.user.user_password)
         tweets = json.loads(urllib.urlopen(url).read())
         print(tweets)
         for key in tweets:
             try:
                 author = tweets[key]
                 self.add_read_story(key, [author])
                 self.add_user(author)
             except Exception:
                 # One bad story must not stop the remaining ones.
                 log_event("fetch_stories_failed", "AgentCell", self.id, "Adding fetched story %s failed, for %s" % (key, self.user), correlation_id)
     except Exception:
         # ``Exception`` rather than a bare except, so that SystemExit and
         # KeyboardInterrupt still propagate.
         log_event("fetch_stories_failed", "AgentCell", self.id, "Failed to fetch stories for %s" % self.user, correlation_id)
示例#38
0
def article():
	"""Render the extracted-article page for the ``url`` query parameter.

	The document is looked up in ``mongo.article`` by URL. On a miss the
	page is fetched and passed through ``html2article``; up to ten links
	found on the page whose ``url2tpl`` value equals this page's are
	attached as ``urls`` and the result is saved with ``view`` set to 1. On
	a hit the stored ``urls`` JSON is decoded and the stored view counter
	is incremented by one. ``pubtime`` is cut to its first ten characters
	before the template is rendered.
	"""
	import logging

	url = request.args.get('url')

	article = mongo.article.find_one({'_id':url})

	if not article:
		try:
			html = get_or_cache(url)
			article = html2article(html, url, selector=True, merge=True)
			if article and not article['src_name']:
				article['src_name'] = get_domain(url)

			tpl = url2tpl(url)
			urls = html2urls(html, url)
			# For each link keep the longest of its candidate values
			# (presumably anchor texts -- confirm against html2urls).
			texts = dict((link, max(candidates, key=len)) for link, candidates in urls.iteritems())
			tmp = dict((link, url2tpl(link)) for link in texts.iterkeys())

			urls = {}
			for u, t in tmp.iteritems():
				if u != url and t == tpl:
					urls[u] = texts[u]
					if len(urls) >= 10:
						break

			if article:
				article['urls'] = urls
				article['_id'] = url
				article['view'] = 1
				article['last'] = time.time()

				# Persist a copy so the in-memory ``urls`` stays a dict
				# for the template while the stored value is JSON text.
				copy = article.copy()
				copy['urls'] = json.dumps(copy['urls'])
				mongo.article.save(copy)
		except Exception:
			# Extraction stays best-effort (the page is rendered with
			# whatever was obtained), but the failure is no longer
			# invisible and SystemExit/KeyboardInterrupt propagate.
			logging.exception('article extraction failed for %s', url)
	else:
		article['urls'] = json.loads(article['urls'])
		mongo.article.update({'_id':url}, {'$set':{'view':article['view'] + 1}})

	if article:
		article['pubtime'] = article['pubtime'][:10]

	return render_template('extract/article.html', article=article, url=url)
示例#39
0
    def crawl(self, url, max_page_depth=5, max_external_sites_page_depth=4, request_rate_limit=4):
        """
        Crawl ``url``: internal URLs first, then external ones.

        The limits are stored on the instance, the start URL's domain is
        recorded and the URL itself is queued for the internal scan before
        both crawl phases run.

        :param url: URL to start crawling from
        :param max_page_depth: depth limit for internal (same-domain) pages
        :param max_external_sites_page_depth: depth limit for external
            (different-domain) pages
        :param request_rate_limit: value handed to the URL scanner's
            request limit (up to n requests at once)
        :return: result of ``_get_crawled_urls()`` (list of Url objects,
            see schemas/url.py)
        """
        self._url_scanner.set_request_limit(request_rate_limit)
        self._max_page_depth, self._max_external_sites_page_depth = (
            max_page_depth, max_external_sites_page_depth)
        self._domain = get_domain(url)

        # Seed the internal queue with the start URL, then run both phases.
        self._internal_urls_to_scan.append(url)
        self._crawl_internal_urls()
        self._crawl_external_urls()

        crawled = self._get_crawled_urls()
        return crawled
示例#40
0
def main():
    """Render the service configuration from ``SRC_FILE`` and apply it.

    ``REPLACE_URL`` and ``REPLACE_PORT`` in the template are replaced with
    the quoted service host and port. When ``args.deploy_tag`` is set, the
    image entries under the document's ``data`` key are pointed at that
    tag; otherwise only ``SELF_VERSION`` is set, from the ``SERVICE``
    environment variable. The result is written to ``DST_FILE`` and passed
    to ``utils.apply``.
    """
    # TODO: delete once rename everything to assisted-installer
    if args.target == "oc-ingress":
        service_host = "assisted-installer.{}".format(
            utils.get_domain(args.domain))
        service_port = "80"
    else:
        service_host = utils.get_service_host(SERVICE, args.target)
        service_port = utils.get_service_port(SERVICE, args.target)

    with open(SRC_FILE, "r") as src:
        data = src.read()
    data = data.replace("REPLACE_URL", '"{}"'.format(service_host))
    data = data.replace("REPLACE_PORT", '"{}"'.format(service_port))
    print("Deploying {}".format(DST_FILE))

    # Truthiness instead of ``is not ""``: identity against a literal is
    # unreliable, and an unset (None) tag now takes the no-tag branch.
    if args.deploy_tag:
        image_repos = {
            "IMAGE_BUILDER": "quay.io/ocpmetal/installer-image-build:",
            "AGENT_DOCKER_IMAGE": "quay.io/ocpmetal/agent:",
            "KUBECONFIG_GENERATE_IMAGE":
            "quay.io/ocpmetal/ignition-manifests-and-kubeconfig-generate:",
            "INSTALLER_IMAGE": "quay.io/ocpmetal/assisted-installer:",
            "CONNECTIVITY_CHECK_IMAGE":
            "quay.io/ocpmetal/connectivity_check:",
            "INVENTORY_IMAGE": "quay.io/ocpmetal/inventory:",
            "HARDWARE_INFO_IMAGE": "quay.io/ocpmetal/hardware_info:",
            "SELF_VERSION": "quay.io/ocpmetal/installer-image-build:",
        }
        overrides = {
            key: repo + args.deploy_tag
            for key, repo in image_repos.items()
        }
    else:
        overrides = {"SELF_VERSION": os.environ.get("SERVICE")}

    # safe_load: the template is plain data, and ``yaml.load`` without an
    # explicit Loader is deprecated/rejected by current PyYAML releases.
    y = yaml.safe_load(data)
    y['data'].update(overrides)

    # The destination is opened only once the content is ready, so a
    # failure above does not leave a truncated file behind.
    with open(DST_FILE, "w+") as dst:
        dst.write(yaml.dump(y))

    utils.apply(DST_FILE)
def read_seeds(seedfile):
    """Load the seed URLs from ``seedfile`` into the initial frontier.

    Each line is stripped and passed through ``clean_url``; lines for which
    ``validators.url`` does not return ``True`` are ignored. Every accepted
    URL becomes a ``URL`` object in wave 1.

    :param seedfile: path of a text file with one URL per line.
    :return: tuple ``(frontier_map, frontier, domain_urlcount)`` where
        ``frontier_map`` maps url -> URL object, ``frontier`` maps wave
        number -> list of URL objects, and ``domain_urlcount`` is the value
        threaded through ``checkDomain`` for every accepted URL's domain.
    """
    logging.info("Reading the seeds")
    wave_no = 1
    frontier_map, frontier, domain_urlcount = {}, {}, {}

    with open(seedfile, "r") as f:
        for raw_line in f:
            url = clean_url(raw_line.strip())
            if validators.url(url) is not True:
                continue

            obj = URL(url, wave_no)
            frontier_map[url] = obj
            frontier.setdefault(wave_no, []).append(obj)
            domain_urlcount = checkDomain(get_domain(obj.url), domain_urlcount)

    return frontier_map, frontier, domain_urlcount
示例#42
0
def deploy_grafana_route():
    '''Deploy Grafana Route.

    Renders the route template from ``deploy/monitoring/grafana`` into
    ``build/<namespace>/`` with ``INGRESS_DOMAIN`` and ``REPLACE_NAMESPACE``
    substituted, then applies the rendered file via ``utils.apply``.

    The ingress domain comes from ``utils.get_domain``; if that call fails,
    it is derived from the host of the existing ``assisted-installer``
    ingress instead. The process exits with status 1 when that host does
    not have the form ``assisted-installer.<domain>``.
    '''
    topic = 'Grafana Route'
    src_file = os.path.join(
        os.getcwd(),
        'deploy/monitoring/grafana/assisted-installer-ocp-grafana-route.yaml')
    dst_file = os.path.join(
        os.getcwd(), 'build', deploy_options.namespace,
        'assisted-installer-ocp-grafana-route.yaml')
    try:
        # I have permissions
        ingress_domain = utils.get_domain(target=deploy_options.target,
                                          namespace=deploy_options.namespace,
                                          profile=deploy_options.profile)
    except Exception:
        # No permission to query the domain directly (``Exception`` rather
        # than a bare except, so Ctrl-C and sys.exit are not swallowed).
        # This ingress should be there because of UI deployment
        json_path_ingress = '{.spec.rules[0].host}'
        cmd = "{} -n {} get ingress assisted-installer -o jsonpath='{}'".format(
            CMD_BIN, deploy_options.namespace, json_path_ingress)
        assisted_installer_ingress_domain = utils.check_output(cmd)
        # partition() never raises, so a host without any dot is reported
        # as an error instead of failing with IndexError.
        host_label, _, ingress_domain = \
            assisted_installer_ingress_domain.partition(".")
        if host_label != 'assisted-installer' or not ingress_domain:
            print("Error recovering the ingress route")
            sys.exit(1)

    with open(src_file, "r") as src:
        with open(dst_file, "w+") as dst:
            data = src.read()
            data = data.replace("INGRESS_DOMAIN", ingress_domain)
            data = data.replace('REPLACE_NAMESPACE',
                                f'"{deploy_options.namespace}"')
            print("Deploying {}: {}".format(topic, dst_file))
            dst.write(data)
    utils.apply(target=deploy_options.target,
                namespace=deploy_options.namespace,
                profile=deploy_options.profile,
                file=dst_file)
示例#43
0
 def extract_essence(self, correlation_id):
     """
     Analyze the story text, to extract the essence from it. For the essence, look for a matching StoryEssence cell.
     If found, link the story cell to the StoryEssence cell. Else create a new StoryEssence cell & link the story to it.

     The essence is requested from ``/text_analyzer/extract_essence/`` on the
     host returned by ``get_domain()``. On a non-empty response the essence
     is passed to ``add_essence`` and the story is saved with
     ``is_essence_extracted`` set. Failures are reported through
     ``log_event`` instead of being raised.

     :param correlation_id: identifier forwarded to ``log_event``.
     """
     try:
         print("extract_essence called for story '%s'" % self.core)
         client = Client()
         response = client.get('http://%s/text_analyzer/extract_essence/' % get_domain(), {'text': self.core}).content
         print("got extract essence response:  %s" % (response,))
         if response != "":
             try:
                 self.add_essence(response)
             except Exception:
                 print(sys.exc_info())
                 print("essence= %s" % (response,))
                 log_event("extract_essence_failed", STORY_CELL, self.id, "Adding essence '%s' extracted from story '%s' failed" % (response, self.core), correlation_id)
             # all went well, update the flag (it is set even when
             # add_essence failed above, as long as a response came back)
             self.is_essence_extracted = True
             self.save()
     except Exception:
         # ``Exception`` rather than a bare except, so that SystemExit and
         # KeyboardInterrupt still propagate.
         print("Failed to extract essence %s" % (sys.exc_info(),))
         log_event("extract_essence_failed", STORY_CELL, self.id, "Failed to extract essence from story '%s'" % self.core, correlation_id)
示例#44
0
 def fetch_stories(self, correlation_id=-1):
     """Fetch stories from the twitter sensor and record the unseen ones.

     Requests ``/twitter_sensor/`` on the host returned by ``get_domain()``
     with the user's name and password as parameters and parses the body as
     JSON. For every key of the parsed object, the key is added as a read
     story whose only author is the first element of the mapped value, and
     that element is also added as a user. Failures are reported through
     ``log_event`` instead of being raised.

     :param correlation_id: identifier forwarded to ``log_event``.
     """
     try:
         client = Client()
         tweets = client.get('http://%s/twitter_sensor/' % get_domain(), {'user': self.user.user_name, 'password': self.user.user_password}).content
         tweets = json.loads(tweets)
         print(tweets)
         for key in tweets:
             try:
                 # NOTE(review): an already-stored story ends the whole
                 # fetch (return), not just this key -- confirm that
                 # skipping the remaining tweets is intended.
                 for story in StoryCell.objects.all():
                     if story.core == key:
                         return
                 author = tweets[key][0]
                 self.add_read_story(key, [author])
                 self.add_user(author)
             except Exception:
                 # One bad story must not stop the remaining ones.
                 log_event("fetch_stories_failed", AGENT_CELL, self.id, "Adding fetched story %s failed, for %s" % (key, self.user), correlation_id)
     except Exception:
         # ``Exception`` rather than a bare except, so that SystemExit and
         # KeyboardInterrupt still propagate.
         print("Failed to fetch stories %s" % (sys.exc_info(),))
         log_event("fetch_stories_failed", AGENT_CELL, self.id, "Failed to fetch stories for %s" % self.user, correlation_id)
示例#45
0
 def tiles_link(self):
     """Return the explore URL for this layer, or None when it has none.

     A link is produced only when the layer is shareable and its
     ``layer_type`` is 'XYZ', 'ArcRest' or 'WMS'; it is built from
     ``get_domain(8000)`` and the layer's slug.
     """
     if not self.is_shareable or self.layer_type not in ('XYZ', 'ArcRest', 'WMS'):
         return None
     return '%s/explore/%s' % (get_domain(8000), self.slug)
示例#46
0
 def extract_concepts(self, correlation_id):
     """
     Analyze the story text, to extract named entities. For each named entity, look for a matching Concept cell.
     If found, link the concept cell to the story. Else create a new concept cell & link the story to it.

     The entities are requested from
     ``/text_analyzer/extract_named_entities/`` on the host returned by
     ``get_domain()``. Unless the response is the literal ``"[]"``, it is
     parsed as JSON, each entity is passed to ``add_concept`` and the story
     is saved with ``is_concepts_extracted`` set. Failures are reported
     through ``log_event`` instead of being raised.

     :param correlation_id: identifier forwarded to ``log_event``.
     """
     try:
         client = Client()
         response = client.get('http://%s/text_analyzer/extract_named_entities/' % get_domain(), {'text': self.core}).content
         if response != "[]":
             named_entities = json.loads(response)
             for ne in named_entities:
                 try:
                     self.add_concept(ne)
                 except Exception:
                     # One bad entity must not stop the remaining ones.
                     print(sys.exc_info())
                     log_event("extract_concepts_failed", STORY_CELL, self.id, "Adding concept '%s' extracted from story '%s' failed" % (ne[0], self.core), correlation_id)
             # all went well, update the flag (it is set even when an
             # individual add_concept call failed above)
             self.is_concepts_extracted = True
             self.save()
     except Exception:
         # ``Exception`` rather than a bare except, so that SystemExit and
         # KeyboardInterrupt still propagate.
         print("Failed to extract concepts %s" % (sys.exc_info(),))
         log_event("extract_concepts_failed", STORY_CELL, self.id, "Failed to extract concepts from story '%s'" % self.core, correlation_id)
示例#47
0
 def filter_func(x):
     """Tell whether the record in ``x[1]`` belongs to the wanted domain.

     ``x[1]`` is decoded with ``loads`` and its domain is compared
     case-insensitively against the enclosing scope's ``domain``; the check
     is a substring test, so the record's domain only has to occur inside
     ``domain``.
     """
     record = loads(x[1])
     record_domain = get_domain(record).lower()
     return record_domain in domain.lower()
示例#48
0
        # Create A set of results based upon this result set - iterator??
        ctx['posts'] = entities

        rss_content = self.render_template('./templates/newsfeeds/rss.html', ctx, to_string=True)

        # Set Cache
        ubercache.cache_set(cache_key, rss_content, category='written')

        self.response.headers['Content-Type'] = 'application/xml'
        self.response.write(rss_content)
        return


# Rest Controllers
# URL templates for the post and post-category resources. The host part is
# taken from get_domain() once, when this module is imported; the trailing
# %s is left unformatted here (presumably filled in with the resource id by
# ResourceUrlField -- TODO confirm).
resource_url = 'http://' + get_domain() + '/api/posts/%s'
category_resource_url = 'http://' + get_domain() + '/api/post_categories/%s'


# Field rules for the BlogCategory REST resource: id and URL are output
# only, slug and title are required.
CATEGORY_REST_RULES = [
    ResourceIdField(output_only=True),
    ResourceUrlField(category_resource_url, output_only=True),
    SlugField(BlogCategory.slug, required=True),
    RestField(BlogCategory.title, required=True),
]


REST_RULES = [
    ResourceIdField(output_only=True),
    ResourceUrlField(resource_url, output_only=True),
    SlugField(BlogPost.slug, required=True),
示例#49
0
def data_catalog_bs3(request, template='bs3_catalog.html'):
    """Render the data catalog page.

    All themes, ordered by ``display_name``, are passed through
    ``add_learn_links`` and ``add_ordered_layers_lists`` and handed to the
    template together with the two domain values.
    """
    all_themes = Theme.objects.all().order_by('display_name')
    themes_with_links = add_learn_links(all_themes)
    add_ordered_layers_lists(themes_with_links)

    context = {
        'themes': themes_with_links,
        'domain': get_domain(8000),
        'domain8010': get_domain(),
    }
    request_context = RequestContext(request, context)
    return render_to_response(template, request_context)
示例#50
0
def data_needs(request, template='needs.html'):
    """Render the data needs page.

    All themes, ordered by ``display_name``, are handed to the template
    together with the two values returned by ``add_ordered_needs_lists``
    and the two domain values.
    """
    themes = Theme.objects.all().order_by('display_name')
    ordered_themes, theme_dict = add_ordered_needs_lists(themes)

    context = {
        'themes': themes,
        'theme_dict': theme_dict,
        'ordered_themes': ordered_themes,
        'domain': get_domain(8000),
        'domain8010': get_domain(),
    }
    request_context = RequestContext(request, context)
    return render_to_response(template, request_context)
示例#51
0
def get_user_stats(username, db_file):
    """Aggregate posting statistics for ``username`` from ``db_file``.

    Every record in the database is decoded with ``loads``; records whose
    ``person`` equals ``username`` are counted in total and per domain (as
    returned by ``get_domain``), and their ``created_at`` values feed the
    first and most recent post timestamps.

    :param username: value compared against each record's ``person``.
    :param db_file: path handed to ``DB.open``.
    :return: dict with the keys initialised below; ``user_age_days`` and
        ``user_age_seconds`` are added when at least one post was found.
        The averages stay 0.0 when no time elapsed between the first and
        the most recent post.
    """
    item = {
        "username": username,
        "aliases": [],
        "total_posts": 0,
        "domains": {},
        "first_post_date": None,
        "first_post_date_unix": None,
        "most_recent_post": None,
        "most_recent_post_unix": 0,
        "average_posts_per_hour": 0.0,
        "average_posts_per_day": 0.0,
        "average_posts_per_week": 0.0
    }

    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print("Could not open database.")

    cur = db.cursor()
    try:
        cur.jump()
        while True:
            rec = cur.get(False)
            if not rec:
                break

            loaded_rec = loads(rec[1])
            if loaded_rec['person'] == username:
                # Looks like this is a post by the user we're looking for
                split = get_domain(loaded_rec)
                item['domains'][split] = item['domains'].get(split, 0) + 1

                # NOTE(review): "first" is the first matching record in
                # cursor order, not the minimum timestamp -- confirm the
                # records are stored chronologically.
                if item['first_post_date_unix'] is None:
                    item['first_post_date_unix'] = loaded_rec['created_at']

                if item['most_recent_post_unix'] < loaded_rec['created_at']:
                    item['most_recent_post_unix'] = loaded_rec['created_at']

                item['total_posts'] = item['total_posts'] + 1

            cur.step()
    finally:
        # Release the cursor and the database even if a record fails to
        # decode.
        cur.disable()
        db.close()

    # Clean up everything

    first_time = None
    if item['first_post_date_unix'] is not None:
        unix = float(item['first_post_date_unix'])
        first_time = datetime.fromtimestamp(unix)
        item['first_post_date'] = first_time.isoformat()

    recent_time = None
    if item['most_recent_post_unix'] is not None:
        unix = float(item['most_recent_post_unix'])
        recent_time = datetime.fromtimestamp(unix)
        item['most_recent_post'] = recent_time.isoformat()

    if first_time is not None and recent_time is not None:
        delta = recent_time - first_time
        elapsed_seconds = delta.total_seconds()
        item['user_age_days'] = delta.days
        item['user_age_seconds'] = elapsed_seconds

        # Rates are only defined over a positive time span; a single post
        # (or identical timestamps) used to raise ZeroDivisionError here.
        if elapsed_seconds > 0:
            seconds_per_hour = 60.0 * 60.0
            seconds_per_day = seconds_per_hour * 24.0
            seconds_per_week = seconds_per_day * 7.0
            total_posts = item['total_posts']
            item['average_posts_per_hour'] = total_posts / (
                elapsed_seconds / seconds_per_hour)
            item['average_posts_per_day'] = total_posts / (
                elapsed_seconds / seconds_per_day)
            item['average_posts_per_week'] = total_posts / (
                elapsed_seconds / seconds_per_week)

    return item
示例#52
0
def get_user_stats(username, db_file):
    """Aggregate posting statistics for ``username`` from ``db_file``.

    Every record in the database is decoded with ``loads``; records whose
    ``person`` equals ``username`` are counted in total and per domain (as
    returned by ``get_domain``), and their ``created_at`` values feed the
    first and most recent post timestamps.

    :param username: value compared against each record's ``person``.
    :param db_file: path handed to ``DB.open``.
    :return: dict with the keys initialised below; ``user_age_days`` and
        ``user_age_seconds`` are added when at least one post was found.
        The averages stay 0.0 when no time elapsed between the first and
        the most recent post.
    """
    item = {
        "username": username,
        "aliases": [],
        "total_posts": 0,
        "domains": {},
        "first_post_date": None,
        "first_post_date_unix": None,
        "most_recent_post": None,
        "most_recent_post_unix": 0,
        "average_posts_per_hour": 0.0,
        "average_posts_per_day": 0.0,
        "average_posts_per_week": 0.0
    }

    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print("Could not open database.")

    cur = db.cursor()
    try:
        cur.jump()
        while True:
            rec = cur.get(False)
            if not rec:
                break

            loaded_rec = loads(rec[1])
            if loaded_rec['person'] == username:
                # Looks like this is a post by the user we're looking for
                split = get_domain(loaded_rec)
                item['domains'][split] = item['domains'].get(split, 0) + 1

                # NOTE(review): "first" is the first matching record in
                # cursor order, not the minimum timestamp -- confirm the
                # records are stored chronologically.
                if item['first_post_date_unix'] is None:
                    item['first_post_date_unix'] = loaded_rec['created_at']

                if item['most_recent_post_unix'] < loaded_rec['created_at']:
                    item['most_recent_post_unix'] = loaded_rec['created_at']

                item['total_posts'] = item['total_posts'] + 1

            cur.step()
    finally:
        # Release the cursor and the database even if a record fails to
        # decode.
        cur.disable()
        db.close()

    # Clean up everything

    first_time = None
    if item['first_post_date_unix'] is not None:
        unix = float(item['first_post_date_unix'])
        first_time = datetime.fromtimestamp(unix)
        item['first_post_date'] = first_time.isoformat()

    recent_time = None
    if item['most_recent_post_unix'] is not None:
        unix = float(item['most_recent_post_unix'])
        recent_time = datetime.fromtimestamp(unix)
        item['most_recent_post'] = recent_time.isoformat()

    if first_time is not None and recent_time is not None:
        delta = recent_time - first_time
        elapsed_seconds = delta.total_seconds()
        item['user_age_days'] = delta.days
        item['user_age_seconds'] = elapsed_seconds

        # Rates are only defined over a positive time span; a single post
        # (or identical timestamps) used to raise ZeroDivisionError here.
        if elapsed_seconds > 0:
            seconds_per_hour = 60.0 * 60.0
            seconds_per_day = seconds_per_hour * 24.0
            seconds_per_week = seconds_per_day * 7.0
            total_posts = item['total_posts']
            item['average_posts_per_hour'] = total_posts / (
                elapsed_seconds / seconds_per_hour)
            item['average_posts_per_day'] = total_posts / (
                elapsed_seconds / seconds_per_day)
            item['average_posts_per_week'] = total_posts / (
                elapsed_seconds / seconds_per_week)

    return item
示例#53
0
 def description_link(self):
     """Return the learn-page URL anchored at this object's slug.

     The URL is built from ``get_domain(8000)``, the name of the first
     entry of ``self.themes.all()`` and ``self.slug``.
     """
     first_theme = self.themes.all()[0]
     theme_name = first_theme.name
     return '%s/learn/%s#%s' % (get_domain(8000), theme_name, self.slug)
示例#54
0
from rest.resource import RestField, SlugField, ResourceIdField, ResourceUrlField
from rest.resource import BooleanField, ResourceField
from rest.params import coerce_to_datetime
from rest.utils import get_key_from_resource_id

from files.rest_helpers import REST_RESOURCE_RULES as FILE_REST_RULES

from modules.events.internal import api as events_api
from modules.events.internal.models import Event
from modules.events.constants import CATEGORY, PRIMARY_IMAGE_PROP
from utils import ubercache

from cal.rest_helpers import EventDateField
from utils import get_domain

# URL template for event resources. The host part is taken from get_domain()
# once, when this module is imported; the trailing %s is left unformatted
# here.
resource_url = 'http://' + get_domain() + '/api/events/%s'  # TODO: HRM?

# verbosity vs. input vs. output

REST_RULES = [
    ResourceIdField(output_only=True),
    ResourceUrlField(resource_url, output_only=True),
    SlugField(Event.slug, required=True),
    RestField(Event.name, required=True),
    RestField(Event.url, required=False),
    EventDateField(Event.event_dates, required=True),
    RestField(Event.content),
    RestField(Event.summary),
    BooleanField(Event.featured),
    RestField(Event.primary_image_resource_id, required=False),
    ResourceField(PRIMARY_IMAGE_PROP,
示例#55
0
def learn_page(request, theme_name=None, template='learn_page.html'):
    """Render the learn page with the active topics.

    Active topics, sorted by ``ordering``, are handed to the template
    together with the two domain values. ``theme_name`` is accepted but not
    used in this function.
    """
    active_topics = Topic.objects.filter(active=True).order_by('ordering')
    context = {
        'topics': active_topics,
        'domain': get_domain(8000),
        'domain8010': get_domain(),
    }
    request_context = RequestContext(request, context)
    return render_to_response(template, request_context)
示例#56
0
def topic_page(request, topic_name=None, template='topic_page.html'):
    """Render the page of the topic named ``topic_name`` (404 if missing).

    The template receives the topic, its first map view (by ``ordering``),
    that view's name as ``initial_view``, the topic's layers sorted by
    name and the two domain values. The topic must have at least one map
    view, otherwise indexing the views raises IndexError.
    """
    topic = get_object_or_404(Topic, name=topic_name)
    views = MapView.objects.filter(topic=topic).order_by('ordering')
    serialized_names = simplejson.dumps([view.name for view in views])
    layers = topic.layers.all().order_by('name')

    first_view = views[0]
    context = {
        'topic': topic,
        'views': [first_view],
        # NOTE(review): serialized_names is a JSON string, so [0] is its
        # first character rather than the first view name -- confirm this
        # is what the template expects.
        'views_list': [serialized_names[0]],
        'initial_view': first_view.name,
        'layers': layers,
        'domain': get_domain(8000),
        'domain8010': get_domain(),
    }
    request_context = RequestContext(request, context)
    return render_to_response(template, request_context)
示例#57
0
 def is_filtered_jid(self, user_jid):
     """Return True when ``user_jid`` is rejected by the whitelist.

     A JID is rejected only when the whitelist is non-empty and contains
     neither the JID itself nor its domain. In every other case the result
     is None.
     """
     whitelist = self._whitelist
     if not whitelist:
         return None
     if user_jid in whitelist or get_domain(user_jid) in whitelist:
         return None
     return True
示例#58
0
 def learn_link(self):
     """Return the learn-page URL built from ``get_domain(8000)`` and ``self.name``."""
     return '%s/learn/%s' % (get_domain(8000), self.name)
示例#59
0
 def get_absolute_url(self):
     """Return the story view URL for this object on the ``get_domain()`` host."""
     host = get_domain()
     return "http://%s/cells/view/story/%d" % (host, self.id)