Example #1
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")  # name the parser explicitly, as in Example #2
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
Example #2
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
Example #3
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    # Finds all links that begin with a "/" or that contain the site URL (includeUrl)
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
Example #4
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    # Finds all links that begin with a "/" or that contain the site URL (includeUrl)
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
Example #5
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme+"://"+urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj,domain)
    externalLinks = getExternalLinks(bsObj,domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)
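
Examples #1-#5 assume shared module-level scaffolding that the snippets do not show: the imports, the global allExtLinks/allIntLinks sets, and a getExternalLinks helper. A minimal sketch of that assumed context follows; the helper body is an illustration, not the original code.

# Assumed module-level context for Examples #1-#5 (illustrative, not the original).
import re
import random
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup

allExtLinks = set()
allIntLinks = set()

def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Links that start with http(s) and do not contain the current domain
    for link in bsObj.findAll("a",
                              href=re.compile("^(http|https)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks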
Example #6
 def domain(self, instance):
     url_instances = list(instance.urls.all())
     urls = [{
         'url': url.url,
         'category': url.category
     } for url in url_instances]
     other_urls = []
     for item in urls:
         if item['category'] == 'others':
             other_urls.append(item['url'])
     website = instance.website_set.all()
     if website.exists():
         website = list(website)
         website_url = website[0].url
         try:
             other_urls.remove(website_url)
         except ValueError:
             pass
     domains = []
     if other_urls:
         for item in other_urls:
             parsed_url = urlparse(item)
             domain = parsed_url.scheme + '://' + parsed_url.netloc
             domains.append({'url': item, 'domain': domain})
         return domains
Example #7
def test_compare_triple_counts():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(
                    urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)

            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            # triple counts
            nt_fdp, nt_dump = len(g_fdp), len(g_dump)
            assert_equals(
                nt_fdp, nt_dump,
                'Triple counts differ: %d (FDP) vs. %d (ref)' %
                (nt_fdp, nt_dump))
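
test_compare_triple_counts relies on module-level fixtures (MIME_TYPES, URLs, DUMP_DIR, the g_fdp and g_dump graphs, Request/urlopen) that the snippet does not show. The sketch below illustrates what such fixtures could look like with rdflib; every concrete value (MIME types, endpoint URLs, dump directory) is an assumption, not taken from the original test module.

# Illustrative fixtures assumed by the FDP comparison tests (Examples #7, #9, #10, #21).
from os import path
from urllib.request import Request, urlopen
from urllib.parse import urlparse
from rdflib import Graph
from rdflib.compare import graph_diff   # used by the triple-diff variants

MIME_TYPES = {'text/turtle': 'ttl',
              'application/rdf+xml': 'rdf',
              'application/ld+json': 'jsonld'}
DUMP_DIR = path.join(path.dirname(__file__), 'dumps')
URLs = ['http://localhost:8080/fdp',                     # hypothetical FDP endpoints
        'http://localhost:8080/fdp/catalog/catalog-01']

g_fdp = Graph()    # filled from the live FDP responses
g_dump = Graph()   # filled from the reference dumps on disk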
Example #8
 def __init__(self, endpoint, server=None, port=None, use_srv=True, wait=80,
         hold=4, requests=5, headers=None, PIPELINE=True, GZIP=True):
     PlugIn.__init__(self)
     self.DBG_LINE = 'bosh'
     self._exported_methods = [
         self.send, self.receive, self.disconnect,
     ]
     url = urlparse(endpoint)
     self._http_host = url.hostname
     self._http_path = url.path
     if url.port:
         self._http_port = url.port
     elif url.scheme == 'https':
         self._http_port = 443
     else:
         self._http_port = 80
     self._http_proto = url.scheme
     self._server = server
     self._port = port
     self.use_srv = use_srv
     self.Sid = None
     self._rid = 0
     self.wait = wait
     self.hold = hold
     self.requests = requests
     self._pipeline = None
     self.PIPELINE = PIPELINE
     if self.PIPELINE:
         self._respobjs = []
     else:
         self._respobjs = {}
     self.headers = headers or self.default_headers
     self.GZIP = GZIP
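
The port selection above follows the usual convention: an explicit port in the endpoint URL wins, otherwise 443 for https and 80 for everything else. A small self-contained illustration of that rule (the endpoint URLs are made up):

from urllib.parse import urlparse

def bosh_port(endpoint):
    """Mirror the port-defaulting logic of the constructor above."""
    url = urlparse(endpoint)
    if url.port:
        return url.port
    return 443 if url.scheme == 'https' else 80

print(bosh_port('https://example.net/http-bind'))      # 443
print(bosh_port('http://example.net:5280/http-bind'))  # 5280
print(bosh_port('http://example.net/http-bind'))       # 80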
Example #9
def test_compare_triples():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(
                    urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)

            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            both, first, second = graph_diff(g_fdp, g_dump)
            n_first = len(first)
            # n_second = len(second)
            # n_both = len(both)

            assert_equals(
                n_first, 0,
                '{} triple(s) different from reference:\n\n{}===\n{}\n'.format(
                    n_first, first.serialize(format='turtle'),
                    second.serialize(format='turtle')))
Example #10
def test_compare_triples():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)

            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            both, first, second = graph_diff(g_fdp, g_dump)
            n_first = len(first)
            # n_second = len(second)
            # n_both = len(both)

            assert_equals(
               n_first, 0, '{} triple(s) different from reference:\n\n{}===\n{}\n'.format(
                  n_first, first.serialize(format='turtle'), second.serialize(format='turtle')))
Example #11
 def __init__(self, endpoint, server=None, port=None, use_srv=True, wait=80,
         hold=4, requests=5, headers=None, PIPELINE=True, GZIP=True):
     PlugIn.__init__(self)
     self.DBG_LINE = 'bosh'
     self._exported_methods = [
         self.send, self.receive, self.disconnect,
     ]
     url = urlparse(endpoint)
     self._http_host = url.hostname
     self._http_path = url.path
     if url.port:
         self._http_port = url.port
     elif url.scheme == 'https':
         self._http_port = 443
     else:
         self._http_port = 80
     self._http_proto = url.scheme
     self._server = server
     self._port = port
     self.use_srv = use_srv
     self.Sid = None
     self._rid = 0
     self.wait = 80
     self.hold = hold
     self.requests = requests
     self._pipeline = None
     self.PIPELINE = PIPELINE
     if self.PIPELINE:
         self._respobjs = []
     else:
         self._respobjs = {}
     self.headers = headers or self.default_headers
     self.GZIP = GZIP
Example #12
 def register_node(self, address):
     parsed_url = urlparse(address)
     if parsed_url.netloc:
         self.nodes.add(parsed_url.netloc)
     elif parsed_url.path:
         self.nodes.add(parsed_url.path)
     else:
         raise ValueError('Invalid URL')
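
register_node accepts both full URLs and bare host:port strings: when a scheme is present urlparse fills netloc, otherwise (on recent Python 3) the address ends up in path and the second branch is taken. A quick illustration with made-up addresses:

from urllib.parse import urlparse

u1 = urlparse('http://192.168.0.5:5000')
print(u1.netloc, repr(u1.path))   # 192.168.0.5:5000 '' -> first branch stores the netloc

u2 = urlparse('192.168.0.5:5000')
print(repr(u2.netloc), u2.path)   # '' 192.168.0.5:5000 -> second branch stores the path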
Example #13
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme+"://"+urlparse(siteUrl).netloc
    print("\n\nDOMAIN of the page",siteUrl,"is: ",domain)
    bsObj = BeautifulSoup(html,"html.parser")
    internalLinks = getInternalLinks(bsObj,domain)
    print("\nInternal links found on the starting page:",internalLinks)
    externalLinks = getExternalLinks(bsObj,domain)
    print("External links found on the starting page:",externalLinks,"\n")

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print("Adding link",link,"to the global list")
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)
    print("\n\nThe global list of all external links on the site is:",allExtLinks)
Example #14
    def _rickscore(self, url, score=0):

        parsed = urlparse(url)
        if not parsed[1]:
            return 0

        try:
            soup = self._url2soup(url)
            title = soup.find("title").string or ''
        except HTTPError, e:
            raise Exception, 'http error %s for %s' % (e.code, url)
Example #15
    def _rickscore(self, url, score=0):

        parsed = urlparse(url)
        if not parsed[1]:
            return 0

        try:
            soup = self._url2soup(url)
            title = soup.find("title").string or ''
        except HTTPError, e:
            raise Exception, 'http error %s for %s' % (e.code, url)
Example #16
def transform_url(url, **kwargs):
    url_parts = list(urlparse(url))

    for key, value in kwargs.items():
        try:
            index = _PARSED_URL_INDICES[key]
        except KeyError:
            continue

        url_parts[index] = value

    return urlunparse(tuple(url_parts))
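
transform_url depends on a module-level _PARSED_URL_INDICES mapping that is not shown. Since urlparse returns a 6-tuple (scheme, netloc, path, params, query, fragment), the mapping presumably looks like the sketch below; the dict itself is an assumption, only its shape is implied by the snippet.

from urllib.parse import urlparse, urlunparse

# Assumed mapping from keyword names to positions in urlparse's 6-tuple.
_PARSED_URL_INDICES = {
    'scheme': 0,
    'netloc': 1,
    'path': 2,
    'params': 3,
    'query': 4,
    'fragment': 5,
}

# Example: transform_url('https://user:pw@example.com/a?x=1', netloc='example.com')
# returns 'https://example.com/a?x=1'.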
Example #17
def get_upload_path(instance, filename):
    """
    Return the path based on the primary_key of the related page
    """
    from urlparse import urlparse
    parsed = urlparse(instance.original_url)
    directory_name = os.path.normpath(
        os.path.join('vintage', parsed.netloc,
                     os.path.dirname(parsed.path).strip('/')))
    new_filename = os.path.normpath(
        instance.content.storage.get_valid_name(os.path.basename(filename)))
    return os.path.join(directory_name, new_filename)
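
For orientation, a stripped-down sketch of the path this builds, leaving out the Django model and the storage.get_valid_name step (the URL and filename below are made up):

import os
from urllib.parse import urlparse

def sketch_upload_path(original_url, filename):
    """Path construction only; the real code also passes the filename
    through instance.content.storage.get_valid_name()."""
    parsed = urlparse(original_url)
    directory_name = os.path.normpath(
        os.path.join('vintage', parsed.netloc,
                     os.path.dirname(parsed.path).strip('/')))
    return os.path.join(directory_name, os.path.basename(filename))

# 'vintage/example.com/articles/2009/photo.jpg' on POSIX systems
print(sketch_upload_path('http://example.com/articles/2009/photo.jpg', 'photo.jpg'))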
Example #18
    def _validateURI(self, uri):
        if six.PY2:
            u = urlparse.urlparse(uri)
        else:
            u = urlparse(uri)

        if u.scheme not in ('http', 'https', 'ftp'):
            raise ValueError(
                "Missing/invalid URI scheme '%s' [http|https|ftp]." % uri)

        if u.netloc == '':
            raise ValueError('No host specified.')

        return uri
Example #19
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    print("\n\nDOMAIN of the page", siteUrl, "is: ", domain)
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj, domain)
    print("\nInternal links found on the starting page:", internalLinks)
    externalLinks = getExternalLinks(bsObj, domain)
    print("External links found on the starting page:", externalLinks, "\n")

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print("Adding link", link, "to the global list")
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)
    print("\n\nThe global list of all external links on the site is:",
          allExtLinks)
Example #20
def get_upload_path(instance, filename):
    """
    Return the path based on the primary_key of the related page
    """
    from urlparse import urlparse
    parsed = urlparse(instance.original_url)
    directory_name = os.path.normpath(
        os.path.join(
            'vintage',
            parsed.netloc,
            os.path.dirname(parsed.path).strip('/'))
    )
    new_filename = os.path.normpath(
        instance.content.storage.get_valid_name(
            os.path.basename(filename)))
    return os.path.join(directory_name, new_filename)
Example #21
def test_compare_triple_counts():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)

            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            # triple counts
            nt_fdp, nt_dump = len(g_fdp), len(g_dump)
            assert_equals(
               nt_fdp, nt_dump, 'Triple counts differ: %d (FDP) vs. %d (ref)' % (nt_fdp, nt_dump))
Example #22
def _post_multipart(self, selector, fields, files):
    '''Post fields and files to an http host as multipart/form-data.

    :param fields: a sequence of (name, value) tuples for regular form
        fields
    :param files: a sequence of (name, filename, value) tuples for data to
        be uploaded as files

    :returns: the server's response page

    '''
    from urlparse import urljoin, urlparse

    content_type, body = self._encode_multipart_formdata(fields, files)

    headers = self._auth_headers()
    url = urljoin(self.base_location + urlparse(self.base_location).netloc,
                  selector)
    req = requests.post(
        url, data=dict(fields), files={files[0][0]: files[0][1:]},
        headers=headers
    )
    return req.status_code, req.error, req.headers, req.text
Example #23
 def isallowed(self, root, url):
     # urllib2.urlparse is the urlparse *module* in Python 2 and cannot be called;
     # normalise the URL to its site root with urljoin instead.
     root = urllib2.urlparse.urljoin(root, "/")
     return self.robots[root].can_fetch(AGENTNAME, url)
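
The method assumes a self.robots mapping from a site root to a parser object with a can_fetch method, plus an AGENTNAME constant. A minimal Python 3 sketch of that assumed structure (the original snippet is Python 2 / urllib2; the names mirror the snippet but the bodies are illustrative):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

AGENTNAME = 'MyCrawler'   # assumed user-agent string
robots = {}               # site root -> RobotFileParser

def isallowed(url):
    root = urljoin(url, '/')                 # e.g. 'http://example.com/'
    if root not in robots:
        rp = RobotFileParser(urljoin(root, 'robots.txt'))
        rp.read()                            # fetches and parses robots.txt
        robots[root] = rp
    return robots[root].can_fetch(AGENTNAME, url)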
Example #24
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''
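
Note that netloc is the full authority component (it can include credentials and a port), and it is only populated when the URL carries a scheme or starts with '//'; for a bare 'example.com/...' string this helper returns ''. For instance:

from urllib.parse import urlparse

print(urlparse('https://blog.example.com/post/1').netloc)     # 'blog.example.com'
print(urlparse('https://user:pw@example.com:8080/x').netloc)  # 'user:pw@example.com:8080'
print(urlparse('example.com/post/1').netloc)                  # '' (no scheme)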
Example #25
class AibotSpider(scrapy.Spider):
    name = 'aibot'
    allowed_domains = []
    start_urls = []
    domains = []
    handle_httpstatus_list = [403, 404]
    urlsfile = open("base.csv", 'r')
    reader = csv.reader(urlsfile)
    data = {}
    for row in reader:
        if len(row) == 0:
            break

        print(row[0], '==============', row[1])
        urlsA = urlparse(row[0]).netloc
        if urlsA.startswith('www'):
            urls = urlsA.split('www.')[1]
            allowed_domains.append(urls)
            temp = urlsA.split('.')
            data[temp[1]] = row[1]
        else:
            allowed_domains.append(urlsA)
            temp = urlsA.split('.')
            data[temp[0]] = row[1]
        domains.append(row[1])
        start_urls.append("http://" + urlsA + "/")

    urlsfile.close()
    print("=================the start urls", start_urls)
    print("==================the allowed domains", allowed_domains)
    print("===================the domains", domains)

    valid_url = []
    invalid_url = []
    count = 0
    maxdepth = 7
    domain = ''

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):

        from_url = ''
        from_text = ''
        depth = 0

        if 'from' in response.meta: from_url = response.meta['from']
        if 'text' in response.meta: from_text = response.meta['text']
        if 'depth' in response.meta: depth = response.meta['depth']

        if response.status in [404, 400, 301, 302, 500]:
            self.invalid_url.append({
                'url': response.url,
                'from': from_url,
                'text': from_text
            })
        else:

            self.valid_url.append({
                'url': response.url,
                'from': from_url,
                'text': from_text
            })

            gowtham = AdvancedspiderItem()
            gowtham['urls'] = response.url
            yield gowtham

            print(depth, response.url, '<-', from_url, from_text)

            with open('dir_test.txt', 'a+') as f:
                AibotSpider.count += 1
                if AibotSpider.count <= 500000:
                    print('count=%d depth=%d' % (AibotSpider.count, depth),
                          response.url,
                          '<-',
                          from_url,
                          from_text,
                          sep=' ')

                    f.write('%s \r\n' % (response.url))

            if depth < self.maxdepth:

                a_selectors = response.xpath("//a")

                for selector in a_selectors:
                    text = selector.xpath('text()').extract_first()

                    link = selector.xpath('@href').extract_first()

                    request = response.follow(link, callback=self.parse)
                    request.meta['from'] = response.url
                    request.meta['text'] = text
                    request.meta['depth'] = depth + 1

                    yield request
Example #26
    def __addReviews(self):

        '''
        next_page is used to detect the case where there is only a single page
        of reviews, so it is null for the first function call.

        eBay shows a [top reviews] section when a product has one:
        case 1: if there is only one page of reviews, a top review is not repeated.
        case 2: if there is more than one page, a top review also appears as a
                regular review on a later page, so it is seen twice (top reviews
                on page 1, and again on some page x).
        Problem: in the "no updates" case we keep crawling until we reach a review
        we have already crawled, matching on its unique review_id, which fails for
        case 2.
        Solution: when there is more than one review page, do not pick up the top
        reviews (i.e. drop case 2).
        '''
        #reviews  = self.soup.findAll('div',{'class':re.compile('\s*singlereview')}) #that are top reviews and general reviews
        reviews  = self.soup.findAll('div','rvw')
        log.info(len(reviews))
        for review in reviews:
            page ={}           
            try:
                page['ef_rating_overall'] = float(review.find('div','RatingBar').\
                                            find('span',attrs ={'class':re.compile('rating-lf avs\d+')})['class'].\
                                            strip().split('avs')[-1])
                
##                                       float(review.find('div','cll rrbr').\
##                                            find('img')['src'].split('/')[-1].\
##                                            split('_')[-1].split('.gif')[0])
            except:
                log.exception(self.log_msg('could not parse overall rating'))

            try:
                page['data'] =  stripHtml(review.find('div','cll con').\
                                renderContents()).replace('\n>','')
            except:
                page['data'] = ''
                log.exception(self.log_msg('could not parse review data'))

            try:
                num_helpful = review.find('div',{'class':'cll hlp'})
                if num_helpful:
                    page['ei_data_recommended_yes'] = int(num_helpful.strong.renderContents())
                    page['ei_data_recommended_total'] = int(num_helpful.findAll('strong')[1].\
                                                        renderContents())
            except:
                log.exception(self.log_msg('could not parse review helpfulness'))
            try:
                page['title'] = stripHtml(review.find('h4','g-b').renderContents())
            except:
                page['title'] = ''
                log.exception(self.log_msg('could not parse review title'))
            try:
                date_str = stripHtml(review.find('div','dte cllr').renderContents()).split(':')[-1].strip()
                page['posted_date'] =  datetime.strptime(date_str,"%m/%d/%y").strftime("%Y-%m-%dT%H:%M:%SZ")
            except:
                log.exception(self.log_msg('could not parse posted_date'))
                page['posted_date'] =  datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")    
                
            try:
                page['et_author_name'] = stripHtml(review.find('span','mbg-nw').renderContents())
            except:
                log.exception(self.log_msg('could not parse author name'))
            copyuri = self.currenturi 
            try:
                page['et_author_profile']=review.find('span','mbg-nw').findParent('a')['href']
                self.currenturi = page['et_author_profile']
                self.__setSoup()
                try:
                    member_info = stripHtml(self.soup.find('div' , {'id':'MemberInfo'}).renderContents())
                    info_dict = dict([line.split(':') for line in re.split(re.compile(r'\n+'),member_info) if line.strip()])
                    info_dict = dict( [ [k.strip(),v.strip()] for k,v in info_dict.items()])
                except:
                    info_dict = {}
                    log.info(self.log_msg('could not parse member information'))
                if info_dict.get('Location'):
                    page['et_author_location'] = info_dict['Location'].strip()
                if info_dict.get('Member since'):
                    member_since = info_dict['Member since'].strip()
                    page['edate_author_member_since'] = datetime.strftime(datetime.strptime(member_since,'%b-%d-%y'),"%Y-%m-%dT%H:%M:%SZ")
                self.currenturi = copyuri
            except:
                log.exception(self.log_msg('could not parse author profile link'))    
            try:    
                review_hash = get_hash( page )    
                unique_key = get_hash({'data' : page['data'], 'posted_date' : page['posted_date']})
                if checkSessionInfo(self.genre, self.session_info_out, unique_key,self.\
                                task.instance_data.get('update'),parent_list=[ self.parenturi ]):
                    log.info(self.log_msg('Session info returns True for uri %s'% \
                                                                unique_key))
                    return False	
                result=updateSessionInfo(self.genre, self.session_info_out, unique_key \
                                    , review_hash,'Review', self.task.instance_data.get('update'), \
                                                                        parent_list=[ self.parenturi ])
                if result['updated']:
                    parent_list = [ self.parenturi ]
                    page['parent_path'] = copy.copy( parent_list )
                    page['path']=parent_list
                    page['entity'] = 'Review'
                    page['uri'] = normalize( self.currenturi )
                    page['uri_domain'] = urlparse(page['uri'])[1]
##                    parent_soup = copy.copy(self.soup)
##                    try:
##                        if self.task.instance_data.get('pick_user_info') and page.get('et_author_profile'):
##                            self.__getUserInfo(page , page['et_author_profile'])
##                        else:
##                            log.info(self.log_msg('could not get user profile link or pick_user_info option is not enabled'))
##                    except:
##                            log.exception(self.log_msg('could not parse user information'))
##                    self.soup = parent_soup
                    page.update(self.__task_elements_dict)
                    self.pages.append(page)
                    log.info('page added for %s'%self.currenturi)
                else:
                    log.exception(self.log_msg('Update session info returns False for \
                                                    url %s'%self.currenturi))
            except:
                log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
        return True    
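
The deduplication described in the docstring boils down to hashing the review body together with its posted date and using that as the session key, so a re-crawl can stop as soon as it meets a review it already stored. A stripped-down sketch of that idea; get_hash and the session store here are stand-ins for the project's own helpers:

import hashlib
import json

def get_hash(d):
    """Stable hash of a dict (stand-in for the crawler's get_hash helper)."""
    return hashlib.md5(json.dumps(d, sort_keys=True).encode('utf-8')).hexdigest()

seen_keys = set()   # stand-in for checkSessionInfo/updateSessionInfo state

def should_store(page):
    unique_key = get_hash({'data': page['data'], 'posted_date': page['posted_date']})
    if unique_key in seen_keys:   # already crawled: stop on a "no updates" run
        return False
    seen_keys.add(unique_key)
    return True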
Example #27
 def isallowed(self, root, url):
     # urllib2.urlparse is the urlparse *module* in Python 2 and cannot be called;
     # normalise the URL to its site root with urljoin instead.
     root = urllib2.urlparse.urljoin(root, "/")
     return self.robots[root].can_fetch(AGENTNAME, url)
Example #28
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc  # network location
    except:
        return ''
Example #29
def extract_credentials_from_url(url):
    parsed_url = urlparse(url)
    url_without_credentials = transform_url(url, netloc=parsed_url.hostname)
    return url_without_credentials, parsed_url.username, parsed_url.password
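
A usage example, assuming transform_url from Example #16 is in scope (the URL is made up). Note that rebuilding netloc from parsed_url.hostname lowercases the host and drops any explicit port, which may or may not be what a caller wants:

url = 'https://alice:s3cret@Example.com:8443/repo.git'
clean, user, password = extract_credentials_from_url(url)
print(clean)     # 'https://example.com/repo.git'  (port dropped, host lowercased)
print(user)      # 'alice'
print(password)  # 's3cret'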
Example #30
def process_batch(sm_account_id,
                  graph,
                  interactions,
                  batch_requests,
                  p_session,
                  processed_interactions=None,
                  cutoff=None):
    """
    A function that sends batch requests to FB, collects the results and prepares the
    next set of batch requests for data corresponding to pagination.
    Call itself recursively until all posts in the given period are fetched.
    :param sm_account_id:
    :param graph:
    :param interactions:
    :param batch_requests:
    :param p_session:
    :param processed_interactions: Number of interactions already processed
    :param cutoff: stop collection if processed_interactions exceeds cutoff
    :return:
    """
    with transaction.manager:
        for interaction in interactions:
            p_session.merge(interaction)

    if len(batch_requests) == 0 or (processed_interactions
                                    and processed_interactions >= cutoff):
        return

    # process batch requests
    # Number of max items in a batch request is 50
    MAX_BATCH_SIZE = 50
    batch_requests_p = [{
        'method': req.get('method'),
        'relative_url': req.get('relative_url')
    } for req in batch_requests]
    batch_data = []

    interactions_new = set()
    batch_requests_new = []

    for i in range(math.ceil(len(batch_requests_p) / MAX_BATCH_SIZE)):
        # TODO handle connection error. attempt retries
        try:
            # Each batch holds at most MAX_BATCH_SIZE requests; the slice end is
            # exclusive, so no request is skipped between consecutive batches.
            batch_req = json.dumps(
                batch_requests_p[i * MAX_BATCH_SIZE:(i + 1) * MAX_BATCH_SIZE],
                indent=1)
            batch_data += graph.request("", post_args={'batch': batch_req})

        except ConnectionError as e:
            logger.exception(
                'unable to process batch request \n:{}'.format(batch_req))
    for req, batch_response in zip(batch_requests, batch_data):
        parent_id = req.get('parent_id')
        if 'body' in batch_response:
            batch_response_data = json.loads(batch_response['body'])
            if 'error' in batch_response_data and batch_response_data[
                    'error'].get('code') == 1:
                # handle request failure - 'Please reduce the amount of data you are asking for, then retry your request'
                error_url = req.get('relative_url')
                parse_result = urlparse(error_url)
                # parse_qs comes from urllib.parse; urlparse here is the function, not the module
                query_data = parse_qs(parse_result.query)
                old_limit = query_data.get('limit')[0]
                sm_account_id = parse_result.path.split("/")[2]
                new_limit = int(float(old_limit) / 2)
                new_req = get_feed_request(sm_account_id, limit=new_limit)
                batch_requests_new.append(new_req)

            if 'data' in batch_response_data:
                for interaction_raw in batch_response_data['data']:
                    Interactions.get_nested_interactions(
                        sm_account_id, interaction_raw, interactions_new,
                        batch_requests_new, parent_id)
            if 'paging' in batch_response_data and 'next' in batch_response_data[
                    'paging']:
                next_url = urlparse(batch_response_data['paging']['next'])
                relative_url = next_url.path + '?' + next_url.query + '&include_headers=false'
                req = {
                    'method': 'GET',
                    'relative_url': relative_url,
                    'parent_id': parent_id
                }
                batch_requests_new.append(req)
        else:
            logger.info(
                'Exception occurred while collecting posts for {} skipping this..'
                .format(sm_account_id))

    process_batch(sm_account_id, graph, interactions_new, batch_requests_new,
                  p_session, processed_interactions + len(interactions),
                  cutoff)
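
The error-code-1 branch implements the "reduce the amount of data" recovery by halving the limit query parameter of the failed feed request and queueing a fresh one. Isolated, that recalculation looks roughly like the sketch below; get_feed_request is the module's own helper and the relative URL is made up:

from urllib.parse import urlparse, parse_qs

def halved_limit(relative_url):
    """Recompute the account id and a halved 'limit' for a failed feed request."""
    parsed = urlparse(relative_url)
    old_limit = parse_qs(parsed.query)['limit'][0]
    sm_account_id = parsed.path.split('/')[2]
    new_limit = int(float(old_limit) / 2)
    # the real code then calls get_feed_request(sm_account_id, limit=new_limit)
    return sm_account_id, new_limit

print(halved_limit('/v2.8/1234567890/feed?limit=50'))   # ('1234567890', 25)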