Example #1
    def parse(self, response):
        # Spider contracts
        """ This function parses a property page.
        @url http://web:3912/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        # Useful URLs for horizontal and vertical crawling
        # Horizontal crawling (next-page links)
        urls = response.xpath('//*[contains(@class,"next")]//@href').extract()
        absUrls = [urlparse.urljoin(response.url, i) for i in urls]
        # Vertical crawling (item links)
        urls = response.xpath('//*[@itemprop="url"]/@href').extract()
        absUrls = [urlparse.urljoin(response.url, i) for i in urls]
        # Use the spider's predefined log() method to output everything summarized in the primary fields table
        self.log("title: %s" % response.xpath('//*[@itemprop="name"][1]/text()').extract())
        self.log("price: %s" % response.xpath('//*[@itemprop="price"[1]/text()').re('[.0-9]+'))
        self.log("description: %s" % response.xpath('//*[@itemprop="description"][1]/text()').extract())
        self.log("address: %s" % response.xpath('//*[@itemprop="http://schema.org/Place"][1]/text()').extract())
        self.log("image_urls: %s" % response.xpath('//*[@itemprop="image"][1]/@src').extract())

        # # Populate the Item
        # item = PropertiesItem()
        # item['title'] = response.xpath('//*[@itemprop="name"][1]/text()').extract()
        # item['price'] = response.xpath('//*[@itemprop="price"][1]/text()').re('[.0-9]+')
        # item['description'] = response.xpath('//*[@itemprop="description"][1]/text()').extract()
        # item['address'] = response.xpath('//*[@itemprop="http://schema.org/Place"][1]/text()').extract()
        # item['image_urls'] = response.xpath('//*[@itemprop="image"][1]/@src').extract()
        # return item


        # Clean up - item loaders and housekeeping fields
        L = ItemLoader(item=PropertiesItem(), response=response)
        L.add_xpath('title', '//*[@itemprop="name"][1]/text()')
        L.add_xpath('price', '//*[@itemprop="price"][1]/text()', re='[.0-9]+')
        L.add_xpath('description', '//*[@itemprop="description"][1]/text()')
        L.add_xpath('address', '//*[@itemprop="http://schema.org/Place"][1]/text()')
        L.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src')


        # Use processors to post-process our XPath/CSS results.
        # Apply a few of them in the spider to shape the output the way we want.
        L.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title))
        L.add_xpath('price', '//*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float), re='[.0-9]+')
        L.add_xpath('description', '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join())
        L.add_xpath('address', '//*[@itemprop="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip))
        L.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', MapCompose(lambda i : urlparse.urljoin(response.url, i)))

        # Set the housekeeping fields with the add_value() method
        L.add_value('url', response.url)
        L.add_value('project', self.settings.get('BOT_NAME'))
        L.add_value('spider', self.name)
        L.add_value('server', socket.gethostname())
        L.add_value('date', datetime.datetime.now())
        return L.load_item()
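This example leans on Scrapy's MapCompose and Join processors to clean each XPath result before it lands in the item. As a rough, self-contained sketch of what such processors do to a list of extracted strings (a simplification of the real implementation, with made-up sample values):

def map_compose(values, *functions):
    """Apply each function to every value in turn, dropping None results
    (a simplified imitation of Scrapy's MapCompose processor)."""
    for fn in functions:
        values = [r for r in (fn(v) for v in values) if r is not None]
    return values

def join(values, separator=u' '):
    """Concatenate the values with a separator, like Scrapy's Join processor."""
    return separator.join(values)

raw_prices = [u'1,250.00 ', u' 980.50']
print(map_compose(raw_prices, lambda s: s.replace(',', ''), float))
# [1250.0, 980.5]

raw_description = [u'  Set in the heart of town. ', u'Recently refurbished.  ']
print(join(map_compose(raw_description, lambda s: s.strip())))
# Set in the heart of town. Recently refurbished.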
Example #2
def get_repo_url(mead_tag, nexus_base_url, prefix="hudson-", suffix=""):
    """
    Creates repository Nexus group URL composed of:
        <nexus_base_url>/content/groups/<prefix><mead_tag><suffix>

    :param mead_tag: name of the MEAD tag used to create the proxy URL in settings.xml
    :param nexus_base_url: the base URL of a Nexus instance
    :param prefix: Nexus group name prefix, default is "hudson-"
    :param suffix: Nexus group name suffix, e.g. "-jboss-central" or "-reverse"
    :returns:
    """
    result = urlparse.urljoin(nexus_base_url, "content/groups/")
    result = urlparse.urljoin(result, "%s%s%s/" % (prefix, mead_tag, suffix))
    return result
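One detail worth calling out in this pattern: urljoin() only appends the relative segment when the base URL ends with a slash; otherwise the last path segment of the base is replaced. A minimal sketch with a hypothetical Nexus base URL (made up here) shows why the slash matters:

try:
    import urlparse                        # Python 2, as in the examples on this page
except ImportError:
    from urllib import parse as urlparse   # Python 3 equivalent

good_base = "http://nexus.example.com/nexus/"   # trailing slash
bad_base = "http://nexus.example.com/nexus"     # no trailing slash

print(urlparse.urljoin(good_base, "content/groups/"))
# http://nexus.example.com/nexus/content/groups/
print(urlparse.urljoin(bad_base, "content/groups/"))
# http://nexus.example.com/content/groups/  (the 'nexus' segment gets replaced)

In other words, the group URL documented in the docstring only comes out as intended when nexus_base_url is slash-terminated.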
Example #3
def get_repo_url(mead_tag, nexus_base_url, prefix="hudson-", suffix=""):
    """
    Creates repository Nexus group URL composed of:
        <nexus_base_url>/content/groups/<prefix><mead_tag><suffix>

    :param mead_tag: name of the MEAD tag used to create the proxy URL in settings.xml
    :param nexus_base_url: the base URL of a Nexus instance
    :param prefix: Nexus group name prefix, default is "hudson-"
    :param suffix: Nexus group name suffix, e.g. "-jboss-central" or "-reverse"
    :returns:
    """
    result = urlparse.urljoin(nexus_base_url, "content/groups/")
    result = urlparse.urljoin(result, "%s%s%s/" % (prefix, mead_tag, suffix))
    return result
Example #4
    def _get_directory(self,
                       url,
                       pattern,
                       url_group=1,
                       value_group=2,
                       value_fn=None):
        response = self.request(url)

        if value_group is None:
            if value_fn is None:
                value_fn = lambda m, u: m.group(0)
        else:
            if value_fn is None:
                #value_fn = lambda v: v
                value_fn = int

        count = 0

        for match in pattern.finditer(response):
            count += 1
            match_url = urlparse.urljoin(url, match.group(url_group))
            if value_group is None:
                yield (
                    match_url,
                    value_fn(match, match_url),
                )
            else:
                yield (
                    match_url,
                    value_fn(match.group(value_group)),
                )
Example #5
File: utils.py Project: yarikoptic/rdflib
	def __init__(self, name) :
		"""
		@param name: URL to be opened
		@keyword additional_headers: additional HTTP request headers to be added to the call
		"""
		try :
			# Note the removal of the fragment ID. This is necessary, per the HTTP spec
			req = Request(url=name.split('#')[0])

			req.add_header('Accept', 'text/html, application/xhtml+xml')

			self.data		= urlopen(req)
			self.headers	= self.data.info()

			if URIOpener.CONTENT_LOCATION in self.headers :
				self.location = urlparse.urljoin(self.data.geturl(),self.headers[URIOpener.CONTENT_LOCATION])
			else :
				self.location = name

		except urllib_HTTPError :
			e = sys.exc_info()[1]
			from pyMicrodata import HTTPError
			msg = BaseHTTPRequestHandler.responses[e.code]
			raise HTTPError('%s' % msg[1], e.code)
		except Exception :
			e = sys.exc_info()[1]
			from pyMicrodata import MicrodataError
			raise MicrodataError('%s' % e)
Example #6
    def download_file(self, file_path, dest_path='.'):
        """
        Download a file from file_path to dest_path

        :param file_path: Path to the resource to download
        :type file_path: String

        :param dest_path: Path to where the downloaded file should be saved
        :type dest_path: String

        """
        resource_path = urlparse.urljoin(self.connection.path,
                                         file_path.strip('/'))
        resp, content = self.connection.send_get(resource_path)
        file_name = os.path.basename(file_path)
        write_to_path = os.path.join(dest_path, file_name)

        try:
            file_fd = open(write_to_path, 'wb')
            file_fd.write(content)
        except IOError:
            raise
        finally:
            file_fd.close()

        return resp, content
Example #7
    def parse_editions_url(self, response):
        name = response.css(
            'div.mainContentFloat h1 a::text').extract_first().strip()
        editions = response.meta['editions']
        all_books_data = {}
        if (name not in editions['name']):
            editions['name'] = name
            editions['urls'] = []

        for book_page in response.css('a.bookTitle').xpath('@href'):
            editions['urls'].append((urlparse.urljoin(response.url,
                                                      book_page.extract())))

        next_page = response.xpath("//*[@rel='next']/@href").extract_first()

        if next_page is not None and (len(editions['urls']) <= 200):
            yield response.follow(next_page,
                                  callback=self.parse_editions_url,
                                  meta={'editions': editions})
        else:
            print(editions['name'])
            print(len(editions['urls']))
            for link in editions['urls']:
                request = scrapy.Request(link,
                                         callback=self.parse_editions_data,
                                         meta={
                                             'all_books_data': all_books_data,
                                             'name': editions['name']
                                         })
                yield request
                #yield editions
            editions = {}
Example #8
def download_file(request, file_id):
    """permet a l'utilisateur de telecharger le fichier grace a l'api"""
    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)
    name = "error"
    if isinstance(data, dict):
        dlurl = data.get('download_url')
        name = data.get('name')
        name = name.replace(" ", "_")
        name = name + "." + data.get('file_ext')
        if not name:
            name = "download"
        if dlurl:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            stream_response = StreamingHttpResponse(response.read())
            stream_response[
                'Content-Disposition'] = 'attachment; filename=' + name
        else:
            stream_response = StreamingHttpResponse(
                "No file download URL corresponds to the given dataset id " +
                file_id)

    else:
        stream_response = StreamingHttpResponse(data)
    return stream_response
Example #9
 def model_view(self, request, model_databrowse, url):
     # If the object ID wasn't provided, redirect to the model page,
     # which is one level up.
     if url is None:
         return http.HttpResponseRedirect(
             urlparse.urljoin(request.path, '../')
         )
     easy_model = EasyModel(
         model_databrowse.site,
         model_databrowse.model
     )
     try:
         obj = easy_model.object_by_pk(url)
     except ObjectDoesNotExist:
         raise http.Http404('Id not found')
     except ValueError:
         raise http.Http404('Invalid format key provided')
     return render(
         request,
         'databrowse/object_detail.html',
         {
             'object': obj,
             'root_url': model_databrowse.site.root_url
         }
     )
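The redirect above uses urljoin() with '../' to step up one level from the current request path. A quick sketch with a made-up databrowse-style path (the exact URL layout here is an assumption):

try:
    import urlparse                        # Python 2
except ImportError:
    from urllib import parse as urlparse   # Python 3

# With a trailing slash, '../' steps up exactly one path segment:
print(urlparse.urljoin('/databrowse/myapp/mymodel/42/', '../'))
# /databrowse/myapp/mymodel/

# Without a trailing slash, the last segment is treated as a file name,
# so the result ends up one level higher than intended:
print(urlparse.urljoin('/databrowse/myapp/mymodel/42', '../'))
# /databrowse/myapp/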
Example #10
File: utils.py Project: westurner/rdflib
    def __init__(self, name):
        """
		@param name: URL to be opened
		@keyword additional_headers: additional HTTP request headers to be added to the call
		"""
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            req = Request(url=name.split('#')[0])

            req.add_header('Accept', 'text/html, application/xhtml+xml')

            self.data = urlopen(req)
            self.headers = self.data.info()

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urlparse.urljoin(
                    self.data.geturl(),
                    self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

        except urllib_HTTPError:
            e = sys.exc_info()[1]
            from pyMicrodata import HTTPError
            msg = BaseHTTPRequestHandler.responses[e.code]
            raise HTTPError('%s' % msg[1], e.code)
        except Exception:
            e = sys.exc_info()[1]
            from pyMicrodata import MicrodataError
            raise MicrodataError('%s' % e)
Example #11
File: updater.py Project: skylning/Roam
def install_project(info, basefolder, serverurl, updateMode=False):
    if not serverurl:
        roam.utils.warning("No server url set for update")
        raise ValueError("No server url given")

    roam.utils.info("Downloading project zip")
    if updateMode:
        filename = "{}.zip".format(info['name'])
    else:
        filename = "{}-Install.zip".format(info['name'])

    serverurl = add_slash(serverurl)
    url = urlparse.urljoin(serverurl, "projects/{}".format(filename))

    tempfolder = os.path.join(basefolder, "_updates")
    if not os.path.exists(tempfolder):
        os.mkdir(tempfolder)

    zippath = os.path.join(tempfolder, filename)
    for status in download_file(url, zippath):
        yield status

    yield "Installing"
    with zipfile.ZipFile(zippath, "r") as z:
        z.extractall(basefolder)

    project = roam.project.Project.from_folder(os.path.join(basefolder, info['name']))

    os.chdir(project.folder)
    yield "Running update scripts.."
    run_install_script(project.settings, "after_update")
    project.projectUpdated.emit(project)
Example #12
class CAIssuersParser:
    '''Parses list of CA's from Mozilla, Chrome, Opera, iOS.'''

    # https://en.wikipedia.org/wiki/Certificate_authority#Providers

    CA_LISTS = {
        'mozilla': {
            'list': urlparse.urljoin(
                'https://hg.mozilla.org/releases/mozilla-beta/raw-file/',
                'tip/security/nss/lib/ckfw/builtins/certdata.txt',
            ),
            'pattern': '# Issuer ',
        }
    }
    ISSUERS = []

    # TODO: parse the other lists and store the CA's into a file
    def parse_issuers(self):
        resp = requests.get(self.CA_LISTS['mozilla']['list'])
        raw_list = resp.text
        pattern = self.CA_LISTS['mozilla']['pattern']
        for line in raw_list.split('\n'):
            if line.startswith(pattern):
                # slice off the literal prefix; str.lstrip(pattern) would strip a character set, not the prefix
                issuer = line[len(pattern):]
                if issuer not in self.ISSUERS:
                    print(issuer)
                    self.ISSUERS.append(issuer)
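Prefix removal by slicing (rather than str.lstrip()) matters here: lstrip() strips any leading characters drawn from its argument treated as a set, so issuer names that start with one of those characters get mangled. A short illustration with a made-up issuer line:

pattern = '# Issuer '
line = '# Issuer ssl.example Root CA'     # hypothetical issuer name

# str.lstrip() treats its argument as a set of characters to strip:
print(line.lstrip(pattern))     # 'l.example Root CA'  -- the leading 's', 's' are eaten too

# Slicing off the literal prefix keeps the name intact:
print(line[len(pattern):])      # 'ssl.example Root CA'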
Example #13
File: updater.py Project: skylning/Roam
    def fetch_data(self, rootfolder, filename, serverurl):
        """
        Download the update zip file for the project from the server
        """
        serverurl = add_slash(serverurl)

        tempfolder = os.path.join(rootfolder, "_updates")
        if not os.path.exists(tempfolder):
            os.mkdir(tempfolder)

        filename = "{}.zip".format(filename)
        url = urlparse.urljoin(serverurl, "projects/{}".format(filename))
        zippath = os.path.join(tempfolder, filename)
        if not self.check_url_found(url):
            yield "Skipping data download"
            yield "Done"
            return

        roam.utils.info("Downloading data zip from {}".format(url))
        try:
            for status in download_file(url, zippath):
                yield status
        except UpdateExpection as ex:
            roam.utils.exception("Error in update for project")
            yield "Error in downloading data"
            return

        yield "Extracting data.."
        with zipfile.ZipFile(zippath, "r") as z:
            members = z.infolist()
            for i, member in enumerate(members):
                z.extract(member, rootfolder)
                roam.utils.debug("Extracting: {}".format(member.filename))

        yield "Done"
Example #14
File: api.py Project: nmilliken/jenkinsapi
def install_artifacts(artifacts, dirstruct, installdir, basestaticurl):
        """
        Install the artifacts.
        """
        assert basestaticurl.endswith("/"), "Basestaticurl should end with /"
        installed = []
        for reldir, artifactnames in list(dirstruct.items()):
            destdir = os.path.join(installdir, reldir)
            if not os.path.exists(destdir):
                log.warn("Making install directory %s" % destdir)
                os.makedirs(destdir)
            else:
                assert os.path.isdir(destdir)
            for artifactname in artifactnames:
                destpath = os.path.abspath(os.path.join(destdir, artifactname))
                if artifactname in list(artifacts.keys()):
                    # The artifact must be loaded from jenkins
                    theartifact = artifacts[artifactname]
                else:
                    # It's probably a static file, we can get it from the static collection
                    staticurl = urlparse.urljoin(basestaticurl, artifactname)
                    theartifact = Artifact(artifactname, staticurl)
                theartifact.save(destpath)
                installed.append(destpath)
        return installed
Example #15
def export_to_itol(request, file_id):
    # retrieve newick from galaxy server
    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)

    if isinstance(data, dict):
        dlurl = data.get('download_url')
        if dlurl:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            tmpfile = tempfile.NamedTemporaryFile()
            tmpfile.write(response.read())
            tmpfile.flush()
            # send file to itol server
            url_itol = 'https://itol.embl.de/upload.cgi'
            payload = {
                'tname': "",
                'tfile': open(tmpfile.name, 'rb'),
            }
            r = requests.post(url_itol, files=payload)
            return redirect(r.url)
    return render(request, 'error.html', {
        'errortitle': 'Error querying galaxy',
        'errormessage': data
    })
Example #16
 def parser(self, html):
     data = []
     if not html:
         return data
     soup = BeautifulSoup(html, "lxml")
     for tag in soup.find_all("a", class_="a_title2"):
         origin = urlparse.urljoin(self.root_url, tag['href'])
         sub_html = self.downloader(origin)
         sub_soup = BeautifulSoup(sub_html, 'lxml')
         detail = sub_soup.find('div', class_='detail_xq w770')
         title = detail.find('h2').get_text().strip()
         lis = detail.find_all('li')
         cnnvd_id = lis[0].get_text().strip()[8:]
         cve_id = lis[2].find_all('a')[-1].get_text().strip()
         description = sub_soup.find('div',
                                     class_='d_ldjj').get_text().strip()
         keyword = self.get_keyword(description)
         created = lis[6].find_all('a')[-1].get_text().strip()
         data.append({
             "title": title,
             "cnnvd_id": cnnvd_id,
             "cve_id": cve_id,
             "description": description,
             "keyword": keyword,
             "created": created,
             "origin": origin,
         })
     return data
Example #17
def download_pom(repo_url=None, artifact=None, pom_url=None, target_dir=None):
    """
    Downloads a pom file with give GAV (as array) or from given pom_url and saves it as pom.xml into target_dir.

    :param repo_url: repository URL from which the pom should be downloaded, mandatory only if no pom_url provided
    :param artifact: MavenArtifact instance, mandatory only if no pom_url provided
    :param pom_url: URL of the pom to download, not mandatory
    :target_dir: target directory path, where the pom should be saved, not mandatory
    :returns: path to the saved pom, useful if no target_dir provided
    """
    if not pom_url:
        pom_url = urlparse.urljoin(
            repo_url, "%s/" % string.replace(artifact.groupId, ".", "/"))
        pom_url = urlparse.urljoin(pom_url, "%s/" % artifact.artifactId)
        pom_url = urlparse.urljoin(pom_url, "%s/" % artifact.version)
        pom_url = urlparse.urljoin(
            pom_url, "%s-%s.pom" % (artifact.artifactId, artifact.version))

    handler = None
    try:
        handler = urlopen(pom_url)
    except HTTPError as err:
        logging.error("Failed to download POM %s. %s", pom_url, err)
        return None

    if not target_dir:
        num = 1
        while not target_dir or os.path.exists(target_dir):
            target_dir = "/tmp/maven-temp-path-%s" % num
            num += 1

    pom_path = os.path.join(target_dir, "pom.xml")

    if handler.getcode() == 200:
        pom = handler.read()
        handler.close()
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        pom_file = None
        try:
            pom_file = open(pom_path, "w")
            pom_file.write(pom)
        finally:
            if pom_file:
                pom_file.close()

    return pom_path
Example #18
 def abs_url(url, response):
     """Return absolute link"""
     base = response.xpath('//head/base/@href').extract()
     if base:
         base = base[0]
     else:
         base = response.url
     return urlparse.urljoin(base, url)
Example #19
 def _get_new_urls(self, page_url, soup):
     new_urls = set()  # use a set so .add() below works and duplicates are dropped
     links = soup.find_all('a', class_="e")  # find the matching anchor nodes
     for link in links:
         new_url = link['href']
         new_full_url = urlparse.urljoin(page_url, new_url)  # join into an absolute URL
         new_urls.add(new_full_url)  # collect it
     return new_urls
Example #20
 def _get_new_urls(self, page_url, soup):
     new_urls = set()
     links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
     for link in links:
         new_url = link['href']
         new_full_url = urlparse.urljoin(page_url, new_url)
         new_urls.add(new_full_url)
     return new_urls
Example #21
def download_pom(repo_url=None, artifact=None, pom_url=None, target_dir=None):
    """
    Downloads a pom file with give GAV (as array) or from given pom_url and saves it as pom.xml into target_dir.

    :param repo_url: repository URL from which the pom should be downloaded, mandatory only if no pom_url provided
    :param artifact: MavenArtifact instance, mandatory only if no pom_url provided
    :param pom_url: URL of the pom to download, not mandatory
    :target_dir: target directory path, where the pom should be saved, not mandatory
    :returns: path to the saved pom, useful if no target_dir provided
    """
    if not pom_url:
        pom_url = urlparse.urljoin(repo_url, "%s/" % string.replace(artifact.groupId, ".", "/"))
        pom_url = urlparse.urljoin(pom_url, "%s/" % artifact.artifactId)
        pom_url = urlparse.urljoin(pom_url, "%s/" % artifact.version)
        pom_url = urlparse.urljoin(pom_url, "%s-%s.pom" % (artifact.artifactId, artifact.version))

    handler = None
    try:
        handler = urlopen(pom_url)
    except HTTPError as err:
        logging.error("Failed to download POM %s. %s", pom_url, err)
        return None

    if not target_dir:
        num = 1
        while not target_dir or os.path.exists(target_dir):
            target_dir = "/tmp/maven-temp-path-%s" % num
            num += 1

    pom_path = os.path.join(target_dir, "pom.xml")

    if handler.getcode() == 200:
        pom = handler.read()
        handler.close()
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        pom_file = None
        try:
            pom_file = open(pom_path, "w")
            pom_file.write(pom)
        finally:
            if pom_file:
                pom_file.close()

    return pom_path
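To see what the chained urljoin() calls in download_pom() actually build, here is a small trace with made-up Maven coordinates (plain str.replace stands in for the Python 2 string.replace used above):

try:
    import urlparse                        # Python 2
except ImportError:
    from urllib import parse as urlparse   # Python 3

repo_url = "https://repo.example.org/content/groups/public/"   # hypothetical repository
group_id, artifact_id, version = "org.example.utils", "my-lib", "1.2.3"

pom_url = urlparse.urljoin(repo_url, "%s/" % group_id.replace(".", "/"))
pom_url = urlparse.urljoin(pom_url, "%s/" % artifact_id)
pom_url = urlparse.urljoin(pom_url, "%s/" % version)
pom_url = urlparse.urljoin(pom_url, "%s-%s.pom" % (artifact_id, version))
print(pom_url)
# https://repo.example.org/content/groups/public/org/example/utils/my-lib/1.2.3/my-lib-1.2.3.pom

Each intermediate join ends with a slash, which is what lets the next relative segment append rather than replace the previous one.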
Example #22
def get_city_name(url_city):
    '''Get the list of all available cities.'''
    base_url = "http://www.tianqihoubao.com"
    city_url = urlparse.urljoin(base_url, url_city)
    # print city_url
    html = requests.get(city_url).content
    html_tree = etree.HTML(html)
    links = html_tree.xpath("//td//a/@href")
    return map(get_name, links)
Example #23
def _get_new_urls(page_url, links):
    new_urls = set()
    for link in links:
        new_url = link
        new_full_url = urlparse.urljoin(page_url, new_url)
        parsed = urlparse.urlparse(new_full_url)
        if parsed.path.endswith(".action") or parsed.path.endswith(".do"):
            new_urls.add(new_full_url)
    return new_urls
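The filter above checks the extension on the parsed path rather than on the raw link, which matters once a query string is attached. A short illustration with a hypothetical crawled link:

try:
    import urlparse                        # Python 2
except ImportError:
    from urllib import parse as urlparse   # Python 3

link = 'http://target.example.com/login.do?next=/home'   # made-up link

parsed = urlparse.urlparse(link)
print(parsed.path)                    # /login.do
print(parsed.path.endswith('.do'))    # True
print(link.endswith('.do'))           # False -- a naive check on the raw URL would miss it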
Example #24
def parse_link_rel(url, fn):
    """
    Read through html file ``fn`` downloaded from ``url``, looking for a
    link tag of the form:

    <link rel="alternate"
          type="application/sage"
          title="currently ignored"
          href=".../example.sws" />

    This function reads ``fn`` looking for such tags and returns a list
    of dictionaries of the form

    {'title': from title field in link, 'url': absolute URL to .sws file}

    for the corresponding ``.sws`` files. Naturally if there are no
    appropriate link tags found, the returned list is empty. If the HTML
    parser raises an HTMLParseError, we simply return an empty list.
    """
    from HTMLParser import HTMLParser

    class GetLinkRelWorksheets(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.worksheets = []

        def handle_starttag(self, tag, attrs):
            if (tag == 'link' and ('rel', 'alternate') in attrs
                    and ('type', 'application/sage') in attrs):
                self.worksheets.append({
                    'title': [_ for _ in attrs if _[0] == 'title'][0][1],
                    'url': [_ for _ in attrs if _[0] == 'href'][0][1]
                })

    parser = GetLinkRelWorksheets()
    with open(fn) as f:
        try:
            parser.feed(f.read())
        except HTMLParseError:
            return []

    ret = []
    for d in parser.worksheets:
        sws = d['url']
        # is that link a relative URL?
        if not urlparse.urlparse(sws).netloc:
            # unquote-then-quote to avoid turning %20 into %2520, etc
            ret.append({
                'url':
                urlparse.urljoin(url, urllib.quote(urllib.unquote(sws))),
                'title':
                d['title']
            })
        else:
            ret.append({'url': sws, 'title': d['title']})
    return ret
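The unquote-then-quote step above is what keeps an already percent-encoded href from being encoded twice. A minimal sketch with a made-up page URL and worksheet name:

try:                                       # Python 2, as in the example above
    from urllib import quote, unquote
    from urlparse import urljoin
except ImportError:                        # Python 3
    from urllib.parse import quote, unquote, urljoin

base = 'http://example.org/pub/index.html'   # hypothetical page URL
href = 'My%20Worksheet.sws'                  # href that is already percent-encoded

print(quote(href))
# My%2520Worksheet.sws  -- quoting blindly double-encodes the %20
print(urljoin(base, quote(unquote(href))))
# http://example.org/pub/My%20Worksheet.sws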
Example #25
 def _get_new_urls(self, page_url, soup):
     new_urls = set()
     # collect all matching anchor tags
     links = soup.find_all('a', href=re.compile(r'/pg/\d+\.htm'))
     for link in links:
         new_url = link['href']
         # build the absolute (full) URL
         new_full_url = urlparse.urljoin(page_url, new_url)
         new_urls.add(new_full_url)
     return new_urls
Example #26
    def __query_implementation(self, identifier_name, value):
        action_name = 'Sale/Query/{0}={1}'.format(identifier_name, value)

        request_header = {
            "MerchantKey": str(self.merchant_key),
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }
        return requests.get(urlparse.urljoin(self.host_uri, action_name),
                            headers=request_header)
Example #27
File: crawler.py Project: NaTsUk0/Ajatar
 def _get_new_urls(self, page_url, links):
     # collect the newly crawled URLs
     new_urls = set()
     for link in links:
         new_url = link
         new_full_url = urlparse.urljoin(page_url, new_url)
         new_full_url = self.check_url(new_full_url)
         if (self._judge(new_full_url)):
             new_urls.add(new_full_url)
     return new_urls
Example #28
 def retry_with_request(self, retry_sale_request):
     request_header = {
         "MerchantKey": str(self.merchant_key),
         'Content-Type': 'application/json',
         'Accept': 'application/json'
     }
     action_name = self.resource_name + '/Retry'
     return requests.post(urlparse.urljoin(self.host_uri, action_name),
                          data=json.dumps(retry_sale_request,
                                          cls=uuid_serialize),
                          headers=request_header)
Example #29
def _init_request(path, params, headers, creds):
    credentials = creds or Config()
    hdrs = {
        'Accept': 'application/vnd.pagerduty+json;version=2',
        'Authorization': f'Token token={credentials["pagerduty"].api_key}'
    }
    if headers:
        hdrs.update(headers)
    params = params or {}
    url = urlparse.urljoin(PAGERDUTY_API_URL, path)
    return url, params, hdrs
Example #30
def getDownloadList(url='http://www.yahoo.co.jp'):
    dom = pq(url)
    result = set()
    for img in dom('img').items():
        img_url = img.attr['src']
        if img_url.startswith('http'):
            result.add(img_url)
        else:
            result.add(urlparse.urljoin(url, img_url))

    return result
Example #31
def _load_from_uri(uri, timeout=None, headers={}, cookies=None):
    response = requests.get(uri,
                            cookies=cookies,
                            timeout=timeout,
                            headers=headers)
    content = response.content.strip()
    parsed_url = urlparse.urlparse(uri)
    prefix = parsed_url.scheme + '://' + parsed_url.netloc
    base_path = os.path.normpath(parsed_url.path + '/..')
    base_uri = urlparse.urljoin(prefix, base_path)
    return M3U8(content, base_uri=base_uri), response.cookies
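The base_uri computed above is the playlist's parent directory, which is what relative segment URIs inside the M3U8 get resolved against. A short trace with a hypothetical playlist URL (note that os.path.normpath is applied to a URL path, so this sketch assumes POSIX-style separators, as the original does):

import os
try:
    import urlparse                        # Python 2
except ImportError:
    from urllib import parse as urlparse   # Python 3

uri = 'https://cdn.example.com/streams/channel1/playlist.m3u8'   # made-up playlist

parsed_url = urlparse.urlparse(uri)
prefix = parsed_url.scheme + '://' + parsed_url.netloc
base_path = os.path.normpath(parsed_url.path + '/..')
print(base_path)                               # /streams/channel1
print(urlparse.urljoin(prefix, base_path))     # https://cdn.example.com/streams/channel1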
Example #32
def get_arguments():
    global base_url
    global team_name
    global access_token

    parser = OptionParser(usage='''usage: %prog [options]
        Marks all the channels of all users of given team as read.
        Useful after using the Mattermost bulk import, as otherwise all users have tons of unread messages and
        the Mattermost client has a hard time loading.''')
    parser.add_option(
        "-b",
        "--base-url",
        dest="base_url",
        action="store",
        type="string",
        help=
        "Base URL of Mattermost installation (mandatory), e.g. 'https://mattermost.mycompany.ch/'"
    )
    parser.add_option(
        "-t",
        "--team",
        dest="team",
        action="store",
        type="string",
        help="Team name of which channels should be marked as read (mandatory)"
    )
    parser.add_option(
        "-a",
        "--access-token",
        dest="token",
        action="store",
        type="string",
        help=
        "A valid Mattermost API access token (optional, can be entered interactively)"
    )
    (options, args) = parser.parse_args()

    if options.base_url is None:
        parser.print_help()
        parser.error("Base URL parameter is mandatory")

    if options.team is None:
        parser.error("Team parameter is mandatory")

    if options.token is None:
        access_token = getpass.getpass('Mattermost API access token:')
    else:
        access_token = options.token

    base_url = urlparse.urljoin(options.base_url, '/api/v4')
    team_name = options.team
    print("team_name", team_name)
    print("base_url = ", base_url)
    print("options.base_url", options.base_url)
Example #33
    def _get_months(self, year, type="snapshots"):
        url = urlparse.urljoin(self.base_url, "%s/%4d/" % (
            type,
            year,
        ))

        return self._get_directory(
            url,
            self.YEAR_RE,
            value_fn=lambda v: datetime.date(year, int(v), 1),
        )
Example #34
def get_keep(request, article_id):
    logged_user = request.user
    article = Article.objects.get(id=article_id)
    articles = logged_user.article_set.all()
    if article not in articles:
        article.user.add(logged_user)  # for m2m linking, have tested by shell
        article.keep_num += 1
        article.save()
        return redirect('/focus/')
    else:
        url = urlparse.urljoin('/focus/', article_id)
        return redirect(url)
Example #35
def redirect(request, prefix, tiny, converter=default_converter):
    """
    Redirect to a given object from a short URL.
    """
    # Resolve the prefix and encoded ID into a model object and decoded ID.
    # Many things here could go wrong -- bad prefix, bad value in 
    # SHORTEN_MODELS, no such model, bad encoding -- so just return a 404 if
    # any of that stuff goes wrong.
    try:
        app_label, model_name = settings.SHORTEN_MODELS[prefix].split('.')
    except KeyError:
        raise Http404('Bad prefix.')
    try:
        model = models.get_model(app_label, model_name)
    except LookupError:
        model = False
    if not model:
        raise Http404('Bad model specified in SHORTEN_MODELS.')
    try:
        id = converter.to_decimal(tiny)
    except ValueError:
        raise Http404('Bad encoded ID.')
    
    # Try to look up the object. If it's not a valid object, or if it doesn't
    # have an absolute url, bail again.
    obj = get_object_or_404(model, pk=id)
    try:
        url = obj.get_absolute_url()
    except AttributeError:
        raise Http404("'%s' models don't have a get_absolute_url() method." % model.__name__)
    
    # We might have to translate the URL -- the badly-named get_absolute_url
    # actually returns a domain-relative URL -- into a fully qualified one.
    
    # If we got a fully-qualified URL, sweet.
    if urlparse.urlsplit(url)[0]:
        return HttpResponsePermanentRedirect(url)
    
    # Otherwise, we need to make a full URL by prepending a base URL.
    # First, look for an explicit setting.
    if hasattr(settings, 'SHORTEN_FULL_BASE_URL') and settings.SHORTEN_FULL_BASE_URL:
        base = settings.SHORTEN_FULL_BASE_URL
        
    # Next, if the sites app is enabled, redirect to the current site.
    elif Site._meta.installed:
        base = 'http://%s/' % Site.objects.get_current().domain
        
    # Finally, fall back on the current request.
    else:
        base = 'http://%s/' % RequestSite(request).domain
        
    return HttpResponsePermanentRedirect(urlparse.urljoin(base, url))
Example #36
def parse_link_rel(url, fn):
    """
    Read through html file ``fn`` downloaded from ``url``, looking for a
    link tag of the form:

    <link rel="alternate"
          type="application/sage"
          title="currently ignored"
          href=".../example.sws" />

    This function reads ``fn`` looking for such tags and returns a list
    of dictionaries of the form

    {'title': from title field in link, 'url': absolute URL to .sws file}

    for the corresponding ``.sws`` files. Naturally if there are no
    appropriate link tags found, the returned list is empty. If the HTML
    parser raises an HTMLParseError, we simply return an empty list.
    """
    from HTMLParser import HTMLParser
    class GetLinkRelWorksheets(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.worksheets = []

        def handle_starttag(self, tag, attrs):
            if (tag == 'link' and
                ('rel', 'alternate') in attrs and
                ('type', 'application/sage') in attrs):
                self.worksheets.append({'title': [_ for _ in attrs if _[0] == 'title'][0][1],
                                          'url': [_ for _ in attrs if _[0] == 'href'][0][1]})

    parser = GetLinkRelWorksheets()
    with open(fn) as f:
        try:
            parser.feed(f.read())
        except HTMLParseError:
            return []

    ret = []
    for d in parser.worksheets:
        sws = d['url']
        # is that link a relative URL?
        if not urlparse.urlparse(sws).netloc:
            # unquote-then-quote to avoid turning %20 into %2520, etc
            ret.append({'url': urlparse.urljoin(url, urllib.quote(urllib.unquote(sws))),
                        'title': d['title']})
        else:
            ret.append({'url': sws, 'title': d['title']})
    return ret
Example #37
def tree_visualization(request, file_id):

    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)
    if isinstance(data, dict):
        dlurl = data.get('download_url')
        historyid = data.get('history_id')
        if dlurl and historyid:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            return render(request,
                          template_name='treeviz/tree.html',
                          context={'newick_tree': response.read(),
                                   'history_id': historyid})
    return render(request, 'error.html', {'errortitle': 'Error querying galaxy', 'errormessage': data})
Example #38
 def second_pass(self, response):
     base_url = 'https://www.cityoflaurel.org'
     links_with_dupes = response.css('div#site-main')[0].css('section#site-content')[0].css('a').xpath('@href').extract()
     links = list(set(links_with_dupes))
     for link in links:
         print("LINK: " + link + '\n')
         if link.endswith('.pdf'):
             link = urlparse.urljoin(base_url, link)
             yield Request(url=link, callback=self.save_pdf)
         if "http" not in str(link):
             yield Request(url=base_url + link + '/', callback=self.third_pass)
         elif "cityoflaurel" in link:
             yield Request(url=link, callback=self.third_pass)
         else:
             yield { "link": link }
Example #39
    def cleanup_url(self, value_url, source_url, mark):
        """
        Transform relative URLs into absolute URLs if possible.

        If the value_url is already absolute, or we don't know the
        source_url, then return the existing value. If the value_url is
        relative, and we know the source_url, then try to rewrite it.
        """
        value = urlparse.urlparse(value_url)
        if value.netloc or not source_url:
            url = value_url
        else:
            url = urlparse.urljoin(source_url, value_url)
        if url.startswith('//'):
            url = 'http:' + url # MissingSchema fix
        if mark:
            url = url + mark
        return url
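Two of the branches in cleanup_url() are easy to see with concrete values: a relative URL is rewritten against the source page, while a protocol-relative '//host/...' value already carries a netloc and only needs a scheme prepended. A small sketch with made-up URLs:

try:
    import urlparse                        # Python 2
except ImportError:
    from urllib import parse as urlparse   # Python 3

source_url = 'https://example.com/listing/42'        # hypothetical source page

# Relative value: no netloc, so it gets rewritten against the source page.
print(urlparse.urljoin(source_url, 'images/photo.jpg'))
# https://example.com/listing/images/photo.jpg

# Protocol-relative value: urlparse() reports a netloc, so it is kept as-is
# and the '//' branch just prepends a scheme.
value_url = '//cdn.example.com/photo.jpg'
print(bool(urlparse.urlparse(value_url).netloc))     # True
print('http:' + value_url)                           # http://cdn.example.com/photo.jpg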
Example #40
    def download_scan(self, ignore_files=False):
        """ Téléchargement des scan

        TODO:
            Gérer les authentification HTTP
            Remplace subprocess par la librairie curl directement
        """
        for scan in self.list_pages_by_chapters():
            print(">> Téléchargement du chapitre {}".format(scan[0]))

            chapter_dir = "%s_%s" % (DEFAULT_SCAN_CHAPTER_DIRNAME, scan[0])

            # Create the destination directory
            chapter_path = self.create_dir(
                os.path.join(self.scan_path, self.scan_name, chapter_dir))

            for p in scan[1]:
                img_found = False
                page = "0%s" % p if p < 10 else p
                url = urlparse.urljoin(
                    DEFAULT_SCAN_URL,
                    "%s/%s/%s" % (self.scan_name, scan[0], page))

                # Look for a valid URL by trying each
                # of the image file extensions
                for ext in DEFAULT_IMG_EXT:
                    dl_file = self.download_file(
                        "%s.%s" % (url, ext),
                        chapter_path,
                        ignore_files)
                    if dl_file:
                        print(">> Téléchargement de la page {}".format(page))
                        img_found = True
                        break

                """TODO: log dans un fichier
                Si on a pas du tout trouvé d'image on saute
                le téléchargement
                """
                if not img_found:
                    print("la page {} n'a pas été trouvé".format(page))
                    continue

        return
Example #41
def export_to_itol(request, file_id):
    # retrieve newick from galaxy server
    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)

    if isinstance(data, dict):
        dlurl = data.get('download_url')
        if dlurl:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            tmpfile = tempfile.NamedTemporaryFile()
            tmpfile.write(response.read())
            tmpfile.flush()
            # send file to itol server
            url_itol = 'https://itol.embl.de/upload.cgi'
            payload = {'tname': "", 'tfile': open(tmpfile.name, 'rb'), }
            r = requests.post(url_itol, files=payload)
            return redirect(r.url)
    return render(request, 'error.html', {'errortitle': 'Error querying galaxy', 'errormessage': data})
Example #42
    def list_scan_chapters(self):
        """Parse la page du scan pour savoir combien de chapitre
        sont disponibles.

        TODO:
            Faire en sorte de gérer les chapitres bonus

        Returns:
            Nombre de chapitre trouvé
        """
        chapters = []
        url = urlparse.urljoin(DEFAULT_CHAPTER_URL, "%s/" % self.scan_name)
        if self.test_url(url):
            html = str(urlopen(url).read())
            tabs = re.findall(
                r'(<td class="td">)([A-Za-z0-9\-\ \:]+)(chapitre)\ ([0-9]+)',
                html)
            for t in tabs:
                chapters.append(t[3])
                print("chapitre {} trouvé".format(t[3]))
        return chapters
Example #43
def download_file(request, file_id):
    """permet a l'utilisateur de telecharger le fichier grace a l'api"""
    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)
    name = "error"
    if isinstance(data, dict):
        dlurl = data.get('download_url')
        name = data.get('name')
        name = name.replace(" ","_")
        name = name + "." + data.get('file_ext')
        if not name:
            name = "download"
        if dlurl:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            stream_response = StreamingHttpResponse(response.read())
            stream_response['Content-Disposition'] = 'attachment; filename=' + name
        else:
            stream_response = StreamingHttpResponse("No file download URL corresponds to the given dataset id " + file_id)

    else:
        stream_response = StreamingHttpResponse(data)
    return stream_response
Example #44
 def render(self, context):
     try:
         obj = self.obj.resolve(context)
     except template.VariableDoesNotExist:
         return ''
         
     try:
         prefix = self.get_prefix(obj)
     except (AttributeError, KeyError):
         return ''
     
     tinyid = converter.from_decimal(obj.pk)
             
     if hasattr(settings, 'SHORT_BASE_URL') and settings.SHORT_BASE_URL:
         return urlparse.urljoin(settings.SHORT_BASE_URL, prefix+tinyid)
     
     try:
         return urlresolvers.reverse('shorturls.views.redirect', kwargs = {
             'prefix': prefix,
             'tiny': tinyid
         })
     except urlresolvers.NoReverseMatch:
         return ''
Example #45
 def url(self, path):
     return urlparse.urljoin(settings.TEST_DOMAIN, path)