Example #1
def classify_link(link):
    ''' classify link according to its domain
    '''
    if link is None:
        return link, SITE_TYPE['junk']
    original_url = link
    url = urlparse.urlparse(link)
    max_try_count = 10
    try_count = 0
    while url.netloc in _SHORT_SERVICE:
        if try_count >= max_try_count:
            # if multiple redirect, return as news
            return link, SITE_TYPE['news']
        #get original link of short link
        original_url = _get_original_link(original_url)
        url = urlparse.urlparse(original_url)
        try_count += 1
    domain_token = url.netloc.split('.')
    length = len(domain_token) - 2
    while length >= 0:
        domain = '.'.join(domain_token[length:])
        if domain in _BLACK_SITE_LIST:
            return original_url, _BLACK_SITE_LIST[domain]
        length -= 1
    #treat unclassified link as news link
    return original_url, SITE_TYPE['news']
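A minimal usage sketch; the contents of SITE_TYPE, _SHORT_SERVICE and _BLACK_SITE_LIST below are assumed purely for illustration (only the names come from the example above):

# hypothetical lookup tables, for illustration only
SITE_TYPE = {'junk': 0, 'news': 1, 'video': 2}
_SHORT_SERVICE = set(['bit.ly', 't.co', 'goo.gl'])
_BLACK_SITE_LIST = {'youtube.com': SITE_TYPE['video']}

url, site_type = classify_link('http://www.youtube.com/watch?v=abc')
# 'youtube.com' is a suffix match in _BLACK_SITE_LIST, so site_type == SITE_TYPE['video']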
Example #2
 def __init__(self, announce, piece_length=262144, **kw):
     self.piece_length = piece_length
     if not bool(urlparse.urlparse(announce).scheme):
         raise ValueError('No schema present for url')
     self.tdict = {
         'announce': announce,
         'creation date': int(time()),
         'info': {
             'piece length': self.piece_length
         }
     }
     if kw.get('comment'):
         self.tdict.update({'comment': kw.get('comment')})
     if kw.get('httpseeds'):
         if not isinstance(kw.get('httpseeds'), list):
             raise TypeError('httpseeds must be a list')
         else:
             self.tdict.update({'httpseeds': kw.get('httpseeds')})
     if kw.get('announcelist'):
         if not isinstance(kw.get('announcelist'), list):
             raise TypeError('announcelist must be a list of lists')
         if False in [isinstance(l, list) for l in kw.get('announcelist')]:
             raise TypeError('announcelist must be a list of lists')
         if False in [
                 bool(urlparse.urlparse(f[0]).scheme)
                 for f in kw.get('announcelist')
         ]:
             raise ValueError('No schema present for url')
         else:
             self.tdict.update({'announce-list': kw.get('announcelist')})
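Only __init__ is shown, so the class name is unknown; assuming it belongs to a torrent-metadata class (called Torrent here purely for illustration), it could be used like this:

# hypothetical class name wrapping the __init__ above
# t = Torrent('http://tracker.example.org/announce',
#             comment='demo file',
#             httpseeds=['http://seed.example.org/file'])
# t.tdict['announce']             -> 'http://tracker.example.org/announce'
# t.tdict['info']['piece length'] -> 262144
# Torrent('tracker.example.org/announce') raises ValueError (no scheme)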
Example #3
 def __init__(self, announce, piece_length=262144, **kw):
     self.piece_length = piece_length
     if not bool(urlparse.urlparse(announce).scheme):
         raise ValueError('No schema present for url')
     self.tdict = {
         'announce': announce,
         'creation date': int(time()),
         'info': {
             'piece length': self.piece_length
         }
     }
     if kw.get('comment'):
         self.tdict.update({'comment': kw.get('comment')})
     if kw.get('httpseeds'):
         if not isinstance(kw.get('httpseeds'), list):
             raise TypeError('httpseeds must be a list')
         else:
             self.tdict.update({'httpseeds': kw.get('httpseeds')})
     if kw.get('announcelist'):
         if not isinstance(kw.get('announcelist'), list):
             raise TypeError('announcelist must be a list of lists')
         if False in [isinstance(l, list) for l in kw.get('announcelist')]:
             raise TypeError('announcelist must be a list of lists')
         if False in [bool(urlparse.urlparse(f[0]).scheme) for f in kw.get('announcelist')]:
             raise ValueError('No schema present for url')
         else:
             self.tdict.update({'announce-list': kw.get('announcelist')})
Example #4
def classify_link(link):
    ''' classify link according to its domain
    '''
    if link is None:
        return link, SITE_TYPE['junk']
    original_url = link
    url = urlparse.urlparse(link)
    max_try_count = 10
    try_count = 0
    while url.netloc in _SHORT_SERVICE:
        if try_count >= max_try_count:
            # if multiple redirect, return as news
            return link, SITE_TYPE['news']
        #get original link of short link
        original_url = _get_original_link(original_url)
        url = urlparse.urlparse(original_url)
        try_count += 1
    domain_token = url.netloc.split('.')
    length = len(domain_token) - 2
    while length >= 0:
        domain = '.'.join(domain_token[length:])
        if domain in _BLACK_SITE_LIST:
            return original_url, _BLACK_SITE_LIST[domain]
        length -= 1
    #treat unclassified link as news link
    return original_url, SITE_TYPE['news']
Example #5
 def __getParentPage(self):
     '''
         This will get the Parent Page info
     '''
     page = {}
     try:
         self.hierarchy = page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div',{'class':'rd Microsoft_Msn_Boards_Read_List Web_Bindings_Base'}).findAll('li')]
     except:
         log.info(self.log_msg('Thread hierarchy is not found'))
     try:
        self.forum_title = page['title'] = stripHtml(self.soup.find('h2').renderContents())
     except:
         log.info(self.log_msg('Title Not Found'))
         page['title'] = ''
     if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri, self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True'))
         return False
     for each in ['et_author_name','ei_thread_replies_count','ei_thread_view_count','ei_author_count','et_last_post_author','edate_last_post_date','posted_date']:
         try:
             page[each] = self.task.pagedata[each]
         except:
             log.info(self.log_msg('Page data cannot be extracted for %s'%each))
     try:
         page['ei_thread_id'] = int(urlparse.urlparse(self.currenturi)[4].split('&')[0].split('ThreadId=')[1])
     except:
         log.info(self.log_msg('Thread id not found'))
     try:
         post_hash = get_hash(page)
         id = None
         if self.session_info_out == {}:
             id = self.task.id
         result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri, post_hash, 'Post', self.task.instance_data.get('update'),Id=id)
         if not result['updated']:
             return False
         page['path'] = [self.parent_uri]
         page['parent_path'] = []
         page['uri'] = normalize(self.currenturi)
         page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
         page['priority'] = self.task.priority
         page['level'] = self.task.level
         page['pickup_date'] = datetime.strftime(datetime.utcnow(),'%Y-%m-%dT%H:%M:%SZ')
         page['connector_instance_log_id'] = self.task.connector_instance_log_id
         page['connector_instance_id'] = self.task.connector_instance_id
         page['workspace_id'] = self.task.workspace_id
         page['client_id'] = self.task.client_id
         page['client_name'] = self.task.client_name
         page['last_updated_time'] = page['pickup_date']
         page['versioned'] = False
         page['data'] = ''
         page['task_log_id']=self.task.id
         page['entity'] = 'Post'
         page['category']=self.task.instance_data.get('category','')
         self.pages.append(page)
         log.info(page)
         log.info(self.log_msg('Parent Page added'))
         return True
     except :
         log.exception(self.log_msg("parent post couldn't be parsed"))
         return False
Example #6
def completeurl(fullurl, partialurl):
    from urllib2 import urlparse
    parsed_jobsurl = urlparse.urlparse(fullurl)
    parsed_joburl = urlparse.urlparse(partialurl)
    fulljoburl = urlparse.urlunparse([parsed_jobsurl.scheme, parsed_jobsurl.netloc,
                                      parsed_joburl.path, parsed_joburl.params, parsed_joburl.query,
                                      parsed_joburl.fragment])
    return fulljoburl
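For example (values assumed), the scheme and host of the first URL are combined with the path and query of the second:

base = 'http://example.com/jobs?page=1'
print completeurl(base, '/job/view?id=123')
# expected: http://example.com/job/view?id=123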
Example #7
 def convert(self, value, context, ctx_opts):
   if value[:4] != u'http':
     value = u'http://%s' % value
   domain = urlparse.urlparse(value)[1]
   if not domain or domain == u'':
     domain = urlparse.urlparse(u'http://%s' % value)[1]
   if not domain or len(domain.split(u'.')) < 2 or \
     len(domain.split(u' ')) > 1:
     self.error('invalid_domain', value, context, ctx_opts)
   return domain.lower()
Example #8
    def homepage_url(self):
        """Try ensure we prepend http: to the url if there's nothing there

           This is to ensure we're not generating relative links in the
           user templates."""
        if not self.homepage:
            return self.homepage
        parsed = urlparse.urlparse(self.homepage)
        if parsed.scheme:
            return self.homepage
        # Vague sanity check
        abs_url = ''.join(['http://', self.homepage])
        if urlparse.urlparse(abs_url).scheme == 'http':
            return abs_url
        return self.homepage
Example #9
    def homepage_url(self):
        """Try ensure we prepend http: to the url if there's nothing there

           This is to ensure we're not generating relative links in the
           user templates."""
        if not self.homepage:
            return self.homepage
        parsed = urlparse.urlparse(self.homepage)
        if parsed.scheme:
            return self.homepage
        # Vague sanity check
        abs_url = ''.join(['http://', self.homepage])
        if urlparse.urlparse(abs_url).scheme == 'http':
            return abs_url
        return self.homepage
Example #10
    def test_execute_async(self, call_rpc_client_mock):
        with patch.object(ResponseUnavailableViewMixing, 'verify', return_value=None) as mock_method:
            call_rpc_client_mock.return_value = True
            self.client1.login(username='******', password='******')
            from urllib2 import urlparse

            # get redirect
            response = self.client1.get(self.base1_apy1.get_exec_url()+"&async")
            self.assertEqual(301, response.status_code)
            queries = urlparse.urlparse(response['Location'])[4]
            rid = int(urlparse.parse_qs(queries)['rid'][0])
            transaction = Transaction.objects.get(pk=rid)

            # get state (RUNNING)
            #response = self.client1.get(self.base1_apy1.get_exec_url()+"&rid=%s" % rid, HTTP_ACCEPT='application/xml')
            response = self.client1.get(self.base1_apy1.get_exec_url()+"&rid=%s" % rid)
            self.assertEqual(200, response.status_code)
            tout = {u'status': u'RUNNING', "url": "/fastapp/base/base1/exec/base1_apy1/?json=&rid="+str(rid), 'rid': rid, 'id': u'base1_apy1'}
            self.assertEqual(json.loads(response.content)['rid'], tout['rid'])

            # mock creation of response
            tout = {u'status': u'OK', u'exception': None, u'returned': u'{"content": "{\\"aaa\\": \\"aaa\\"}", "class": "XMLResponse", "content_type": "application/json"}', u'response_class': u'JSONResponse', 'time_ms': '74', 'rid': rid, 'id': u'base1_apy1'}
            transaction.tout = tout
            transaction.save()
            self.assertEqual(transaction.apy, self.base1_apy1)

            # get response
            response = self.client1.get(self.base1_apy1.get_exec_url()+"&rid=%s" % rid)
            self.assertEqual(200, response.status_code)

            # check transaction duration
            transaction = Transaction.objects.get(pk=rid)
            self.assertEqual(int, type(transaction.duration))
Example #11
    def send_email(self, to='', subject='', body='', cc='', bcc=''):
        log.info('sending a mail')
        data = dict(nvp_bu_send='Send')
        for name in 'to subject body cc bcc'.split():
            if vars()[name]:
                data[name] = vars()[name].encode('utf-8')

        if not hasattr(self, 'sendpath'):
            response = self.internal_http_opener.open(self.internalBaseMailUrl + '?ui=html')
            from urllib2 import urlparse
            respurl = urlparse.urlparse(response.geturl())
            try:
                response.close()
            except: pass
            del response
            self.sendpath = respurl.path
        url = 'https://mail.google.com' + self.sendpath
        try:
            at = self.gmail_at
        except KeyError:
            at = ''
        params = dict(at=at, v='b', pv='tl', s='s', fv='b', cpt='c', cs='c')
        if not self.hosted:
            params.update(fv='b', cpt='c', cs='c')
        else:
            params.update(cs='b', s='s')

        url = UrlQuery(url, params)

        response = self.webrequest(url, follow_js_redirects=True, **data)
        log.info('sent a mail')
        assert response and ('Your message has been sent.' in response)
        log.info('send mail success: %r', bool('Your message has been sent.' in response))
        return True
Example #12
    def get_document(self, url):
        """
            Connects to the server and retrieves the document
        """
        set_status(_('Contacting SomaFM server...'))
        hostinfo = urlparse.urlparse(url)

        try:
            c = httplib.HTTPConnection(hostinfo.netloc, timeout=20)
        except TypeError:
            c = httplib.HTTPConnection(hostinfo.netloc)

        try:
            c.request('GET',
                      hostinfo.path,
                      headers={'User-Agent': self.user_agent})
            response = c.getresponse()
        except (socket.timeout, socket.error):
            raise radio.RadioException(_('Error connecting to SomaFM server.'))

        if response.status != 200:
            raise radio.RadioException(_('Error connecting to SomaFM server.'))

        document = response.read()
        c.close()

        set_status('')
        return document
Example #13
	def on_navigation_requested(self,view,frame,req,data=None):
		uri = req.get_uri()
		parse = urlparse.urlparse(uri)
		if self.url_callback.find(parse.hostname) > 0:
			self.getAccessToken(parse)
			return True
		return False
Example #14
    def search_regulars(self):
        """
            Search urls inside the <A> tags
        """

        urls = set()

        tree = XPathExtractor().get_object(self.response.raw_html)

        for link_tag in tree.xpath("//a"):

            if not 'href' in link_tag.attrib:
                continue

            url = link_tag.attrib["href"]

            if not urlparse.urlparse(url).netloc:

                url = self._fix_url(url)

            url = self._normalize_url(url)

            urls.add(url)

        return urls
Example #15
    def search1(self, search):
      url_list = []   #store all the extracted urls in a List
      title_list = [] #store all the extracted titles in a List
      description_list = []  #store all the extracted Description in a List

      for start in range(0,10):
          page = requests.get('http://www.google.de/search?q=' + search + '&start=' + str(start*10), verify=False)
          soup = BeautifulSoup(page.text,'html.parser')

          for cite in soup.findAll('a', attrs={'class':'r'}): #extrcat all URLs
              url = cite.text
              print url
              if not urlparse.urlparse(url).scheme: #check if url has prefix http:// or not
                  url = 'http://' + url
                  print url
              url_list.append(url.replace('https://', 'http://'))
          for tit in soup.findAll('div', attrs={'class':'ellip'}): #extract all Titles
              print tit.text
              title_list.append(tit.text)
            
       
          for descr in soup.findAll('span', attrs={'class':'st'}): #extraxt all description
              print descr.text
              description_list.append(descr.text)

      print title_list
Example #16
def is_url(name):
    try:
        result = urlparse.urlparse(name)
    except Exception:
        return False
    else:
        return result.scheme in ('http', 'https', 'file', 'ftp')
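A quick illustration of the accepted schemes (inputs assumed):

print is_url('https://example.com/index.html')  # True
print is_url('ftp://ftp.example.com/pub/file')  # True
print is_url('just some text')                  # False, no scheme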
Example #17
    def __call__(self, **kwargs):
        field = self.context.getField('dataTitle')
        if field:
            self.info['source']['title'] = field.getAccessor(self.context)()

        field = self.context.getField('dataLink')
        if field:
            self.info['source']['url'] = field.getAccessor(self.context)()

        field = self.context.getField('dataOwner')
        if field:
            vocab = field.Vocabulary(self.context)
            url = field.getAccessor(self.context)()
            title = self.context.displayValue(vocab, url)
            self.info['owner']['title'] = title

            parser = urlparse.urlparse(url)
            if all((parser.scheme, parser.netloc)):
                self.info['owner']['url'] = url
            else:
                self.info['owner']['url'] = self.info['source']['url']

        if self.info['source']['title'] or self.info['source']['url']:
            return self.info
        return self.fallback()
Example #18
    def get_url_list(self, xmlrpc_uri):
        """
        Create a list of urls consisting of the available IPA servers.
        """
        # the configured URL defines what we use for the discovered servers
        (scheme, netloc, path, params, query,
         fragment) = urlparse.urlparse(xmlrpc_uri)
        servers = []
        name = '_ldap._tcp.%s.' % self.env.domain

        rs = dnsclient.query(name, dnsclient.DNS_C_IN, dnsclient.DNS_T_SRV)
        for r in rs:
            if r.dns_type == dnsclient.DNS_T_SRV:
                rsrv = r.rdata.server.rstrip('.')
                servers.append('https://%s%s' %
                               (ipautil.format_netloc(rsrv), path))

        servers = list(set(servers))
        # the list/set conversion won't preserve order so stick in the
        # local config file version here.
        cfg_server = xmlrpc_uri
        if cfg_server in servers:
            # make sure the configured master server is there just once and
            # it is the first one
            servers.remove(cfg_server)
            servers.insert(0, cfg_server)
        else:
            servers.insert(0, cfg_server)

        return servers
Example #19
    def get_lists(self, no_cache=False):
        """
            Returns the rlists for icecast
        """
        from xlgui.panel import radio
        if no_cache or not self.data:
            set_status(_('Contacting Icecast server...'))
            hostinfo = urlparse.urlparse(self.genre_url)
            try:
                c = httplib.HTTPConnection(hostinfo.netloc, timeout=20)
            except TypeError:  # python 2.5 doesnt have timeout=
                c = httplib.HTTPConnection(hostinfo.netloc)
            try:
                c.request('GET',
                          hostinfo.path,
                          headers={'User-Agent': self.user_agent})
                response = c.getresponse()
            except (socket.timeout, socket.error):
                raise radio.RadioException(
                    _('Error connecting to Icecast server.'))

            if response.status != 200:
                raise radio.RadioException(
                    _('Error connecting to Icecast server.'))

            body = response.read()
            c.close()
            set_status('')

            data = {}
            dom = minidom.parseString(body)
            divs = dom.getElementsByTagName('div')
            for div in divs:
                if div.getAttribute('id') == 'content':
                    anchors = div.getElementsByTagName('a')
                    for anchor in anchors:
                        anchor.normalize()
                        for node in anchor.childNodes:
                            if node.nodeType == minidom.Node.TEXT_NODE:
                                data[node.nodeValue] = anchor.getAttribute(
                                    'href')
                                break
                    break
            self.data = data
            self._save_cache()
        else:
            data = self.data
        rlists = []

        for item in data.keys():
            rlist = RadioList(item, station=self)
            rlist.get_items = lambda no_cache, name=item: \
                self._get_subrlists(name=name, no_cache=no_cache)
            rlists.append(rlist)

        sort_list = [(item.name, item) for item in rlists]
        sort_list.sort()
        rlists = [item[1] for item in sort_list]
        self.rlists = rlists
        return rlists
Example #20
    def generate_cookie(self, url_path, session_id, expiration=None, add_header=False):
        '''
        Return a session cookie containing the session id. The cookie
        will be constrained to the url path, defined for use
        with HTTP only, and only returned on secure connections (SSL).

        :parameters:
          url_path
            The cookie will be returned in a request if it begins
            with this url path.
          session_id
            The session id identified by the session cookie
          add_header
            If true format cookie string with Set-Cookie: header

        :returns:
          cookie string
        '''

        if not expiration:      # Catch zero unix timestamps
            expiration = None;

        cookie = Cookie(self.session_cookie_name, session_id,
                        domain=urlparse.urlparse(api.env.xmlrpc_uri).netloc,
                        path=url_path, httponly=True, secure=True,
                        expires=expiration)
        if add_header:
            result = 'Set-Cookie: %s' % cookie
        else:
            result = str(cookie)

        return result
Example #21
    def write(self, url, start_time, end_time, ok, reason):
        '''
        @param url: must be a full http request address, e.g. http://127.0.0.1:8080/xxx/?sdfa=fas
        @param ok: boolean -> True or False
        @param reason: failure reason; a string that must not contain a comma ','
        '''
        process_time = int((end_time - start_time) * 1000)  # milliseconds

        urlps = urlparse.urlparse(url)
        host = '%s%s' % (urlps.hostname,
                         (':%s' % urlps.port) if urlps.port else '')
        url = urlps.path or '/'

        for iu in self.ignore_url:
            if url.startswith(iu):
                return
        ok = ok and 1 or 0  # and 'true' or 'false'
        if self.log_format == 'comma':
            msg = '%s,%s,%s,%s,%s,%s,%s,%s' % (
                self.server_name, self.project_name, host, url, process_time,
                ok, int(start_time), reason)

        elif self.log_format == 'json':
            msg = {
                'server_name': self.server_name,
                'project_name': self.project_name,
                'host': host,
                'url': url,
                'create_time': int(start_time),
                'process_time': process_time,
                'ok': ok,
                'reason': reason
            }
            msg = simplejson.dumps(msg)
        self.write_log(msg)
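With log_format set to 'comma', a successful 42 ms request to http://127.0.0.1:8080/xxx/?a=1 would produce a line roughly like the following (server name, project name, timestamp and reason are assumed):

# myserver,myproject,127.0.0.1:8080,/xxx/,42,1,1577836800,ok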
Example #22
 def __addPosts(self,link):
     '''It will add the post
     '''
     try:
         self.currenturi = link
         if checkSessionInfo('review', self.session_info_out, self.currenturi, \
                         self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'\
                                                            %self.currenturi))
             return False
         self.__setSoupForCurrentUri()    
         page = self.__getData()
         if not page:
             return True
         result = updateSessionInfo('review', self.session_info_out, 
             self.currenturi,get_hash( page ),'review', self.task.instance_data.get('update'))
         if result['updated']:
             page['path'] = [ self.currenturi]
             page['parent_path'] = []
             page['uri']= self.currenturi 
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page['entity'] = 'review'
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
         else:
             log.info(self.log_msg('Update session info returns False for \
                                             url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'\
                                                         %self.currenturi))
         return False 
Example #23
def open_source(url, decode=True):
    parsed = urlparse.urlparse(url)
    if parsed.scheme and len(parsed.scheme)>0 and parsed.scheme[0] == "z":
        import zcom
        return zcom.Connection(url, codec=decode).items()
    else:
        return sharditerator(url, decode=decode, source=url)
Example #24
def gopen(url, mode="rb"):
    """Open the given URL. Supports unusual schemes and uses subprocesses."""
    parsed = urlparse.urlparse(url)
    if parsed.scheme == "gs":
        if mode[0]=="r":
            return os.popen("gsutil cat '%s'" % url, "rb")
        elif mode[0]=="w":
            return os.popen("gsutil cp - '%s'" % url, "wb")
        else:
            raise ValueError("{}: unknown mode".format(mode))
    elif parsed.scheme in "http https ftp".split():
        if mode[0]=="r":
            cmd = "curl --fail -s '%s'" % url
            return os.popen(cmd, "rb")
        elif mode[0]=="w":
            test_curl_write(url)
            cmd = "curl --fail -s -T - '%s'" % url
            return os.popen(cmd, "wb")
        else:
            raise ValueError("{}: unknown mode".format(mode))
    elif parsed.scheme in ["", "file"]:
        if mode[0]=="r":
            return open(parsed.path, "rb")
        elif mode[0]=="w":
            return open(parsed.path, "wb")
        else:
            raise ValueError("{}: unknown mode".format(mode))
Example #25
    def search_regulars(self):
        """
            Search urls inside the <A> tags
        """

        urls = set()

        tree = XPathExtractor().get_object(self.response.raw_html)

        for link_tag in tree.xpath("//a"):

            if not 'href' in link_tag.attrib:
                continue

            url = link_tag.attrib["href"]

            if not urlparse.urlparse(url).netloc:

                url = self._fix_url(url)

            url = self._normalize_url(url)

            urls.add(url)

        return urls
Example #26
def is_file(name):
    try:
        result = urlparse.urlparse(name)
    except Exception:
        return False
    else:
        return result.scheme == 'file'
Example #27
    def __addPosts(self, links, parent_list):
        """Given a list of links to the discussion post, fetch the post contents and the author info
        """
        h = HTTPConnection()
        for link in links:
            try:
                page = {}
                object_id = re.search('objectID=(\d+)', link).group(1)
                link = "http://communities.vmware.com/message/%s#%s" %(object_id, object_id)
                # Using the redirected url instead of the url given by the search page
                self.currenturi = link
                page['uri'] = normalize(link)
                log.debug(self.log_msg("Fetching the post url %s" %(self.currenturi)))
                if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                    self.task.instance_data.get('update'), parent_list=parent_list):
                    # No need to pick this page
                    continue
                res = self._getHTML()

                self.rawpage = res['result']
                self._setCurrentPage()
                # First try extracting from the post body
                if not self.__extractPostBody(page, object_id):
                    # if that fails, extract from the replies
                    self.__extractReplyBody(page, object_id)

            except:
                log.exception(self.log_msg("exception in extracting page"))
                continue
            page['posted_date'] = datetime.datetime.strftime(page['posted_date'], "%Y-%m-%dT%H:%M:%SZ")

            checksum = md5.md5(''.join(sorted(page.values())).encode('utf-8','ignore')).hexdigest()
            id = None
            if self.session_info_out=={}:
                id = self.task.id
            result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                       checksum, 'Post', self.task.instance_data.get('update'),
                                       parent_list=parent_list, Id=id)
            if result['updated']:
                page['path'] =  page['parent_path'] = parent_list
                page['path'].append(self.currenturi)
                page['priority']=self.task.priority
                page['level']=self.task.level
                page['pickup_date'] = datetime.datetime.strftime(datetime.datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                page['connector_instance_log_id'] = self.task.connector_instance_log_id
                page['connector_instance_id'] = self.task.connector_instance_id
                page['workspace_id'] = self.task.workspace_id
                page['client_id'] = self.task.client_id  # TODO: Get the client from the project 
                page['client_name'] = self.task.client_name
                page['last_updated_time'] = page['pickup_date']
                page['versioned'] = False
                page['entity'] = 'Review'
                page['category'] = self.task.instance_data.get('category','')
                page['task_log_id']=self.task.id
                page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                

            # Calculate the hash and get the session info thingy
            self.pages.append(page)
        return True
Example #28
    def fetch_photos_from_msg(self, album, msg=None):
        u = album.user
        token = get_access_token(u)
        graph = facebook.GraphAPI(token)

        if msg.status == 'awaiting':
            parts = urlparse.urlparse(msg.next_page)
            qs = urlparse.parse_qs(parts.query)
            after = qs.get('after')[0]
            photos = graph.get_object(album.fb_album_id + "/photos", fields='id,source', limit=2, after=after)
            new_next_page = photos.get('paging').get('next')
            new_msg = Message.objects.create(next_page=new_next_page, user=u, status='awaiting')
            for photo in photos.get('data'):
                img_temp = NamedTemporaryFile(delete=True)
                img_temp.write(urlopen(photo.get('source')).read())
                img_temp.flush()
                photo_object = Photo.objects.create(title=photo.get('id'),
                    description=photo.get('created_time'),
                    album=album,
                    file=File(img_temp))
                pprint(photo_object.filename)
                self.stdout.write('Successfully fetched photo for source "%s"\n' % photo.get('source'))
            msg.status = 'done'
            msg.save()
            self.stdout.write('Finished this queue "%s"\n' % new_msg.next_page)
Example #29
 def __setParentPage(self):
     """This will get the parent info
     """
     page = {}
     try:
         page['et_thread_hierarchy'] = self.__hierarchy = [x.strip() for x in stripHtml(self.soup.find('div', 'deck breadcrumbs').renderContents()).split('>') if x.strip()][1:]
         page['data'] = page['title'] = page['et_thread_hierarchy'][-1]
     except:
         log.exception(self.log_msg('Thread hierarchy and Title Not found for uri\
                                                         %s'%self.currenturi))
         return
     if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'], \
                                      self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True, Already exists'))
         return
     try:
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash( page ), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['path'] = [self.task.instance_data['uri']] 
             page['parent_path'] = []
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
             page['data'] = ''
             page['entity'] = 'thread'
             page.update(self.__task_elements_dict)
             page['posted_date'] = page['pickup_date']
             self.pages.append(page)
             log.info(self.log_msg('Parent Page Added'))
         else:
              log.info(self.log_msg('Result[updated] returned False for \
                                                      uri %s'%self.currenturi))
     except:
         log.exception(self.log_msg("parent post couldn't be parsed"))
Example #30
 def write(self, url, start_time, end_time, ok, reason):
     '''
      @param url: must be a full http request address, e.g. http://127.0.0.1:8080/xxx/?sdfa=fas
      @param ok: boolean -> True or False
      @param reason: failure reason; a string that must not contain a comma ','
      '''
      process_time = int((end_time - start_time) * 1000) # milliseconds
     
     urlps = urlparse.urlparse(url)
     host = '%s%s' %(urlps.hostname, (':%s' %urlps.port) if urlps.port else '')
     url = urlps.path or '/'
     
     for iu in self.ignore_url:
         if url.startswith(iu):
             return
     ok = ok and 1 or 0 # and 'true' or 'false'
     if self.log_format == 'comma':
         msg = '%s,%s,%s,%s,%s,%s,%s,%s' %(self.server_name, self.project_name, host, url, process_time, ok, int(start_time), reason)
         
     elif self.log_format == 'json':
         msg = { 'server_name': self.server_name, 'project_name': self.project_name, 
                 'host': host,
                 'url': url, 'create_time': int(start_time), 'process_time': process_time,
                 'ok': ok, 'reason': reason}
         msg = simplejson.dumps(msg)
     self.write_log(msg)
Example #31
 def __getParentPage(self,comment):
     """This will get the parent info
     """
     page = {}
     try:
         self.__total_replies_count = page['ei_data_replies_count'] = int(stripHtml(comment.find('totalreplies').renderContents()))
         page['title'] = page['data'] = stripHtml(comment.find('name').renderContents())
         page['posted_date'] = stripHtml(comment.find('dateadded').renderContents()).split('.')[0]
         unique_key = stripHtml(comment.find('messageid').renderContents())
         if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],\
                                      self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info return True, Already exists'))
             return
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash( page ), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['path']=[unique_key] 
             page['parent_path']=[]
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
             page['entity'] = 'post'
             page.update(self.__task_elements_dict)
             log.info(page['data'])
             self.pages.append(page)
         else:
              log.info(self.log_msg('Result[updated] returned False for \
                                                      uri %s'%self.currenturi))
     except:
          log.exception(self.log_msg('Hierarchy/Title not found in url %s'%self.currenturi))
         return
Example #32
 def __addPost(self, post, is_question=False):
     try:
         unique_key = re.search(r'(\d+)', post['id']).groups()[0]
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key, \
                          self.task.instance_data.get('update'),parent_list\
                                         = [self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for %s'%unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'),\
                             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [ self.task.instance_data['uri'], unique_key]
             page['uri'] = self.__baseuri + 'showpost.php?p=' + unique_key
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
Example #33
    def __call__(self, **kwargs):

        field = self.context.getField('provenances')
        provenances = field.getAccessor(self.context)()
        formatted_provenances = []
        for provenance in provenances:
            title = provenance.get('title', '')
            link = provenance.get('link', '')
            owner = provenance.get('owner', '')
            if title != '' or owner != '' or link != '':
                formatted_provenance = {'source':{}, 'owner':{}}
                formatted_provenance['source']['title'] = title
                formatted_provenance['source']['url'] = link

                if owner != '':
                    if hasVocab:
                        owner_title = tmpOrganisationsVocabulary.\
                            getDisplayList(self.context).getValue(owner)
                    else:
                        owner_title = owner
                    formatted_provenance['owner']['title'] = owner_title
                    parser = urlparse.urlparse(owner)
                    if all((parser.scheme, parser.netloc)):
                        formatted_provenance['owner']['url'] = owner
                    else:
                        formatted_provenance['owner']['url'] = link
                formatted_provenances.append(formatted_provenance)

        self.info['provenances'] = formatted_provenances
        return self.info
Example #34
    def get_document(self, url):
        """
            Connects to the server and retrieves the document
        """
        set_status(_('Contacting SomaFM server...'))
        hostinfo = urlparse.urlparse(url)

        try:
            c = httplib.HTTPConnection(hostinfo.netloc, timeout = 20)
        except TypeError:
            c = httplib.HTTPConnection(hostinfo.netloc)

        try:
            c.request('GET', hostinfo.path, headers={'User-Agent':
                    self.user_agent})
            response = c.getresponse()
        except (socket.timeout, socket.error):
            raise radio.RadioException(_('Error connecting to SomaFM server.'))

        if response.status != 200:
            raise radio.RadioException(_('Error connecting to SomaFM server.'))

        document = response.read()
        c.close()

        set_status('')
        return document
Example #35
def html2rest(html,
              writer=sys.stdout,
              encoding='utf8',
              relto=None,
              preprocess=None,
              wrap_width=80,
              nowrap=False,
              embedded_uri=False):
    relroot = relpath = None
    if relto:
        parsed = urlparse.urlparse(relto)
        relroot = parsed.scheme + '://' + parsed.netloc
        relpath = relroot + parsed.path
        if relpath[-1] != '/':
            relpath += '/'
    if preprocess:
        html = preprocess(html, encoding=encoding)
    parser = Parser(writer,
                    encoding,
                    relroot,
                    relpath,
                    wrap_width=wrap_width,
                    nowrap=nowrap,
                    embedded_uri=embedded_uri)
    #parser.feed(readsoup(html))
    parser.feed(html.decode(encoding))
    parser.close()
Example #36
	def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
		self.parserobots = parserobots
		self.output 	= output
		self.report 	= report
		self.domain 	= domain
		self.exclude 	= exclude
		self.skipext 	= skipext
		self.drop		= drop
		self.debug		= debug

		if self.debug:
			logging.basicConfig(level=logging.DEBUG)

		self.tocrawl = set([domain])

		try:
			self.target_domain = urlparse.urlparse(domain)[1]
		except:
			raise ValueError("Invalid domain")


		if self.output:
			try:
				self.output_file = open(self.output, 'w')
			except:
				logging.debug ("Output file not available.")
				exit(255)
Example #37
    def check_config():
        """
        Check crucial configuration details for existence and workability.

        Runs checks to see whether bugtracker's URL is reachable, whether
        backend is available at the right filename, and whether the script has
        the key arguments it needs to run: URL, backend, and database details.

        The filename for the backend in the backends/ directory needs to be the
        same as the configuration argument specifying that backend. For
        instance, invoking the Launchpad backend uses 'lp', and so the filename
        is 'lp.py'.
        """
        Config.check_params(['url', 'backend'])

        if Config.backend + ".py" not in Backend.get_all_backends():
            raise InvalidConfig('Backend "' + Config.backend +
                                '" does not exist')

        url = urlparse.urlparse(Config.url)
        check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
        print("Checking URL: " + check_url)
        req = Request(check_url)
        try:
            response = urlopen(req)
        except HTTPError, e:
            raise InvalidConfig('The server could not fulfill the request ' +
                                str(e.msg) + '(' + str(e.code) + ')')
Example #38
 def __addPost(self, post, is_question = False):
     """
     This will take the post tag , and fetch data and meta data and add it to 
     self.pages
     """
     try:
         unique_key_tag = post.find('a', id=re.compile('postcount\d+'))
         #unique_key = self.__removeSessionId('http://htcpedia.com/forum/' + unique_key_tag['href'])
         unique_key = unique_key_tag['id']
         log.info(unique_key)
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,\
                                     self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['parent_path'] = []
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['uri'] = self.currenturi + "#" + unique_key
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             log.info(page)
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
Example #39
def process_waze_message(token):
    connection=httplib.HTTPConnection("waze.to")
    connection.request("GET","/"+token)
    response=connection.getresponse()
    meetingUrl=response.getheader("location")
    logging.debug(meetingUrl)

    querystring=urlparse.urlparse(meetingUrl).query
    token= urlparse.parse_qs(querystring)["token"][0]
    logging.debug(token)

    meetingInfoUrl="http://mobile-web.waze.com/SocialMediaServer/internal/getMeetingInfo?event_id="+ token
    logging.debug(meetingInfoUrl)
    meetingInfo=json.loads(urllib2.urlopen(meetingInfoUrl).read())
    logging.debug(meetingInfo)

    driveUrl="http://mobile-web.waze.com/rtserver/web/PickUpGetDriverInfo?clientID=70a8b694c7&routeTimestamp=0&getUserInfo=true&token=" + token
    logging.debug(driveUrl)
    driverInfo=json.loads(urllib2.urlopen(driveUrl).read())
    logging.debug(driverInfo)
    if driverInfo['status']=='ok':
        eta = driverInfo['eta']
        insert_waze_request(token,eta)
    else:
        deactivate_waze_request(token)
Example #40
 def clean(self, value):
     if not isinstance(value, basestring):
         self.raise_config_error("is not a URL string.")
     # URLs must be bytes, not unicode.
     if isinstance(value, unicode):
         value = value.encode('utf-8')
     return urlparse.urlparse(value)
Example #41
    def __addPost(self, post):
        '''It will add the post
        '''
        try:
            
            page = self.__getData(post)
            if not page:
                return True
            unique_key  = get_hash( {'data' : page['data'] })
            if checkSessionInfo('review', self.session_info_out, unique_key,\
                         self.task.instance_data.get('update'),parent_list\
                                            = [self.currenturi]):
                log.info(self.log_msg('Session info returns True'))
                return False

            result=updateSessionInfo('review', self.session_info_out, unique_key, \
                get_hash( page ),'Review', self.task.instance_data.get('update'),\
                                parent_list=[self.currenturi])
            if not result['updated']:
                log.info(self.log_msg('Update session info returns False'))
                return True
            page['path'] = [self.currenturi] 
            page['parent_path'] = []
            #page['path'].append(unique_key)
            page['uri'] = self.currenturi
            page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
            page['entity'] = 'post'
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Post Added'))
            return True
        except:
            log.exception(self.log_msg('Error while adding session info'))
            return False  
Example #42
    def check_config():
        """
        Check crucial configuration details for existence and workability.

        Runs checks to see whether bugtracker's URL is reachable, whether
        backend is available at the right filename, and whether the script has
        the key arguments it needs to run: URL, backend, and database details.

        The filename for the backend in the backends/ directory needs to be the
        same as the configuration argument specifying that backend. For
        instance, invoking the Launchpad backend uses 'lp', and so the filename
        is 'lp.py'.
        """
        Config.check_params(['url', 'backend'])

        if Config.backend + ".py" not in Backend.get_all_backends():
            raise InvalidConfig('Backend "' + Config.backend + '" does not exist')

        url = urlparse.urlparse(Config.url)
        check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
        print("Checking URL: " + check_url)
        req = Request(check_url)

        if Config.backend != 'github':
            try:
                response = urlopen(req)
            except HTTPError, e:
                raise InvalidConfig('The server could not fulfill the request '
                                    + str(e.msg) + '(' + str(e.code) + ')')
            except URLError, e:
                raise InvalidConfig('We failed to reach a server. ' + str(e.reason))
Example #43
 def __addPost(self, post):
     """
     This will take the post tag , and fetch data and meta data and add it to 
     self.pages
     """
     try:
         page = self.__getData(post)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         unique_key = get_hash(page)
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,\
                                     self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
             return False
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['parent_path'] = []
             page['path'] = [unique_key]
             page['uri'] = self.currenturi
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             log.info(page)
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
Example #44
    def generate_cookie(self, url_path, session_id, expiration=None, add_header=False):
        '''
        Return a session cookie containing the session id. The cookie
        will be constrained to the url path, defined for use
        with HTTP only, and only returned on secure connections (SSL).

        :parameters:
          url_path
            The cookie will be returned in a request if it begins
            with this url path.
          session_id
            The session id identified by the session cookie
          add_header
            If true format cookie string with Set-Cookie: header

        :returns:
          cookie string
        '''

        if not expiration:      # Catch zero unix timestamps
            expiration = None;

        cookie = Cookie(self.session_cookie_name, session_id,
                        domain=urlparse.urlparse(api.env.xmlrpc_uri).netloc,
                        path=url_path, httponly=True, secure=True,
                        expires=expiration)
        if add_header:
            result = 'Set-Cookie: %s' % cookie
        else:
            result = str(cookie)

        return result
Example #45
def test_compare_triples():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(
                    urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)

            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            both, first, second = graph_diff(g_fdp, g_dump)
            n_first = len(first)
            # n_second = len(second)
            # n_both = len(both)

            assert_equals(
                n_first, 0,
                '{} triple(s) different from reference:\n\n{}===\n{}\n'.format(
                    n_first, first.serialize(format='turtle'),
                    second.serialize(format='turtle')))
Example #46
    def __call__(self, text="", match=""):
        match = match.replace('https://', 'http://')
        soup = BeautifulSoup(text)

        items = set()
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                continue
            href = href.replace('https://', 'http://')

            ourl = urlparse.urlparse(href)
            ourl = ourl._replace(fragment='', query='', params='')
            href = ourl.geturl()

            if u'resolveuid' in href:
                items.add(href)
            elif href.startswith('../'):
                items.add(href)
            elif match and href.startswith(match):
                found = href[len(match):]
                found = found.strip('/')
                items.add(found)
            elif href.startswith('/'):
                href = href.strip('/')
                items.add(href)

        for link in items:
            yield {
                'count': 1,
                'type': 'Link',
                'text': link,
                'relevance': '100.0'
            }
Example #47
def check_link(url):
    '''
    Checks a link whose URL starts with 'http'.

    Ignores links that start with:
    * https://forums.aws.amazon.com
    because you have to be signed in to the forum for the link to be valid.

    Uses urllib2 to parse the URL and check that it is valid.

    @returns True if the link is valid, False otherwise.
    '''
    logger.debug("Checking {}".format(url))
    if re.match(r'https://forums\.aws\.amazon\.com/', url):
        return True
    try:
        if not urlparse.urlparse(url).netloc:
            return False

        website = urlopen(url)
        html = website.read()

        if website.code != 200:
            return False
    except Exception, e:
        logger.exception("")
        return False
    return True
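Illustrative calls (values assumed; the second never reaches the network because the parsed netloc is empty):

# check_link('https://forums.aws.amazon.com/thread.jspa?threadID=1')  -> True (whitelisted forum)
# check_link('no-scheme-here')                                        -> False (empty netloc)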
Example #48
def is_file(name):
    try:
        result = urlparse.urlparse(name)
    except Exception:
        return False
    else:
        return result.scheme == 'file'
Example #49
def is_url(name):
    try:
        result = urlparse.urlparse(name)
    except Exception:
        return False
    else:
        return result.scheme in ('http', 'https', 'file', 'ftp')
Example #50
def test_compare_triples():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)

            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            both, first, second = graph_diff(g_fdp, g_dump)
            n_first = len(first)
            # n_second = len(second)
            # n_both = len(both)

            assert_equals(
               n_first, 0, '{} triple(s) different from reference:\n\n{}===\n{}\n'.format(
                  n_first, first.serialize(format='turtle'), second.serialize(format='turtle')))
Example #51
    def cookies(self):
        """ Cookies
        """
        if self._cookies is None:
            ac_cookie = self.request.cookies.get('__ac', None)
            if not ac_cookie:
                self._cookies = {}
                return self._cookies

            ## XXX There is a bug with wkhtmltopdf --cookie param
            ## Thus we'll use --cookie-jar
            ## EEA ticket #21958. wkhtmltopdf tickets #1870, #1903

            url = urlparse.urlparse(self.context.absolute_url())
            domain = url.hostname

            # Also allow CDN resources
            if domain.startswith(u"www."):
                domain = domain.replace(u"www.", u".", 1)

            cookie = u"__ac={cookie}; domain={domain}; path=/;".format(
                cookie=ac_cookie,
                domain=domain
            )

            with tempfile.NamedTemporaryFile(
                    prefix='eea.converter.', suffix='.cookie.jar',
                    dir=TMPDIR(), delete=False) as ofile:
                ofile.write(cookie)
                self._cookies = ofile.name
        return self._cookies
Example #52
    def cookies(self):
        """ Cookies
        """
        if self._cookies is None:
            ac_cookie = self.request.cookies.get('__ac', None)
            if not ac_cookie:
                self._cookies = {}
                return self._cookies

            ## There is a bug with wkhtmltopdf --cookie param
            ## Thus we'll use --cookie-jar
            ## EEA ticket #21958. wkhtmltopdf tickets #1870, #1903

            url = urlparse.urlparse(self.context.absolute_url())
            domain = url.hostname

            # Also allow CDN resources
            if domain.startswith(u"www."):
                domain = domain.replace(u"www.", u".", 1)

            cookie = u"__ac={cookie}; domain={domain}; path=/;".format(
                cookie=ac_cookie, domain=domain)

            with tempfile.NamedTemporaryFile(prefix='eea.converter.',
                                             suffix='.cookie.jar',
                                             dir=TMPDIR(),
                                             delete=False) as ofile:
                ofile.write(cookie)
                self._cookies = ofile.name
        return self._cookies
Пример #53
0
def test_compare_triple_counts():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(
                    urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)

            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            # triple counts
            nt_fdp, nt_dump = len(g_fdp), len(g_dump)
            assert_equals(
                nt_fdp, nt_dump,
                'Triple counts differ: %d (FDP) vs. %d (ref)' %
                (nt_fdp, nt_dump))
Пример #54
0
    def extract_images(self, soup, tempdir):
        index = 0
        for img in soup.body.findAll('img'):
            if not img.get('src'):
                img['src'] = ''
                continue
            img['src'] = self.convert_virtual_url(img['src'])
            width = 0
            height = 0
            img_data = None
            if img['src'].startswith('//'):
                img['src'] = 'http:' + img['src']
            # local image
            elif img['src'].startswith('/'):
                img_data, width, height = \
                    self.get_local_image(img['src'])
            # remote image
            if img['src'].startswith('http'):
                _, remote_server, img_path, _, _, _ = urlparse.urlparse(
                    img['src'])
                conn = None
                try:
                    conn = httplib.HTTPConnection(remote_server)
                    conn.request('GET', img_path)
                    resp = conn.getresponse()
                    if resp.status != 200:
                        logger.warn('Could not get image {0}: {1} {2}'.format(
                            img['src'], resp.status, resp.reason))
                    else:
                        img_data = resp.read()
                except Exception as e:
                    logger.warn('Error getting remote image {0}: {1}'.format(
                        img['src'], e))
                finally:
                    # close the connection even if it was never fully opened
                    if conn is not None:
                        conn.close()

            if not img_data:
                continue

            img_orig_id = img['src'].split('/')[-1]
            img_ext = img_orig_id.split('.')[-1]
            if img_ext == img_orig_id:
                img_ext = 'dat'
            img_id = 'image%d.%s' % (index, img_ext)
            index += 1
            img_file = open(os.path.join(tempdir, img_id), 'wb')
            img_file.write(img_data)
            img_file.close()
            if 'width' not in img or 'height' not in img:
                try:
                    img_obj = Image.open(img_file.name)
                    width, height = img_obj.size
                except Exception as e:
                    logger.warn('Could not get image size for {0}: {1}'.format(
                        img['src'], e))
            if width and 'width' not in img:
                img['width'] = width
            if height and 'height' not in img:
                img['height'] = height
            img['src'] = os.path.join('images', img_id)
        return soup
Пример #55
0
    def make_connection(self, host):
        self.user_pass, self.realhost = splituser(host)
        proto, proxy, p1, p2, p3, p4 = urlparse.urlparse(
            self.proxies.get('http', ''))
        if proxy and not self.local:
            return httplib.HTTP(proxy)
        else:
            return httplib.HTTP(self.realhost)
Пример #56
0
def _convert_to_text(link):
    parsed = urlparse.urlparse(link.url)
    site = parsed[1]
    # hyphen placed last so the class doesn't form a '.-_' character range
    rest = ' '.join(re.split(r'[/._-]', parsed[2]))
    data = '%s %s %s user*%s topic:%s %s' % (
        site, rest, link.text, link.user.username,
        link.topic.name, link.topic.full_name)
    data = data.replace("'", "*")
    data = data.replace("%", "*")
    return data
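A hedged illustration of how this helper flattens a link into an indexable string; the namedtuple stand-ins below are invented for the example and are not the project's real models.

from collections import namedtuple

# Invented stand-ins for the real link/user/topic objects (illustration only)
User = namedtuple('User', 'username')
Topic = namedtuple('Topic', 'name full_name')
Link = namedtuple('Link', 'url text user topic')

link = Link('http://example.com/some-page_name.html', 'Some page',
            User('alice'), Topic('general', 'General discussion'))
# Prints a flattened, search-friendly string combining the site, the path
# words, the link text, the username and the topic names.
print(_convert_to_text(link))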
Пример #57
0
def open_sink(url, encode=True):
    parsed = urlparse.urlparse(url)
    if parsed.scheme and parsed.scheme[0] == "z":
        import zcom
        return zcom.Connection(url, codec=encode)
    else:
        stream = gopen(url, "wb")
        return tarrecords.TarWriter(stream, encode=encode)
Пример #58
0
def main():
    #argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        help="The host to scrape",
                        action='store',
                        dest='host',
                        required=True)
    parser.add_argument('-o',
                        help="Output file for email addresses",
                        action='store',
                        dest='outfile')
    parser.add_argument('-n',
                        help="Number of email addresses to collect",
                        action='store',
                        dest='ncollect',
                        default=20)
    parser.add_argument('-v',
                        help="verbose output",
                        action='store_true',
                        dest='verbose')
    parser.add_argument('-w',
                        help="wait time between requests",
                        action='store',
                        dest='wait')
    args = parser.parse_args()

    #signal setup with globals
    signal(SIGINT, sig_handle)

    if args.ncollect is None:
        args.ncollect = 20
    if args.wait is None:
        args.wait = 0

    #setup output file name
    if args.outfile is not None:
        outfile = args.outfile
    else:
        outfile = "{}.emails.txt".format(urlparse.urlparse(args.host).netloc)
    emails = []

    #check verbose
    if args.verbose:
        global verbose
        verbose = True

    #scrape, if the first scrape fails, the link is invalid
    if (scrape(args.host, int(args.ncollect), [], emails, int(args.wait),
               True)):
        with open(outfile, "a") as file:  #save emails when finished
            for email in emails:
                file.write(email)
                file.write("\n")
    else:
        print("Link read failed: {}".format(args.host))

    return 0
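Assuming the script is saved as, say, scrape_emails.py (the file name is hypothetical), a typical run based on the arguments defined above would be `python scrape_emails.py -t https://example.com -n 50 -w 2 -v -o emails.txt`: crawl the host, wait two seconds between requests, and append up to fifty addresses to the output file.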
Пример #59
0
    def _populate_hosts_and_request_paths(self):
        """
        Rackspace uses a separate host for API calls which is only provided
        after an initial authentication request. If we haven't made that
        request yet, do it here. Otherwise, just return the management host.
        """
        if not self.auth_token:
            # Initial connection used for authentication
            conn = self.conn_classes[self.secure](self.auth_host,
                                                  self.port[self.secure])
            conn.request(method='GET',
                         url='/%s' % (AUTH_API_VERSION),
                         headers={
                             'X-Auth-User': self.user_id,
                             'X-Auth-Key': self.key
                         })

            resp = conn.getresponse()

            if resp.status == httplib.NO_CONTENT:
                # HTTP NO CONTENT (204): auth successful
                headers = dict(resp.getheaders())

                try:
                    self.server_url = headers['x-server-management-url']
                    self.storage_url = headers['x-storage-url']
                    self.cdn_management_url = headers['x-cdn-management-url']
                    self.lb_url = self.server_url.replace(
                        "servers", "ord.loadbalancers")
                    self.auth_token = headers['x-auth-token']
                except KeyError, e:
                    # Returned 204 but has missing information in the header, something is wrong
                    raise MalformedResponseError('Malformed response',
                                                 body='Missing header: %s' %
                                                 (str(e)),
                                                 driver=self.driver)
            elif resp.status == httplib.UNAUTHORIZED:
                # HTTP UNAUTHORIZED (401): auth failed
                raise InvalidCredsError()
            else:
                # Any response code != 401 or 204, something is wrong
                raise MalformedResponseError(
                    'Malformed response',
                    body='code: %s body:%s' %
                    (resp.status, ''.join(resp.body.readlines())),
                    driver=self.driver)

            for key in [
                    'server_url', 'storage_url', 'cdn_management_url', 'lb_url'
            ]:
                scheme, server, request_path, param, query, fragment = (
                    urlparse.urlparse(getattr(self, key)))
                # Set host to where we want to make further requests to
                setattr(self, '__%s' % (key), server)
                setattr(self, '__request_path_%s' % (key), request_path)

            conn.close()
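The final loop relies on urlparse splitting each service URL into a host for further requests and a request path prefix. A minimal standalone illustration of that split; the URL below is a made-up placeholder, not a real Rackspace endpoint.

try:
    import urlparse                           # Python 2
except ImportError:
    from urllib import parse as urlparse      # Python 3

url = 'https://servers.example.invalid/v1.0/123456'   # placeholder URL
parts = urlparse.urlparse(url)
print(parts.netloc)   # 'servers.example.invalid' -> host for further requests
print(parts.path)     # '/v1.0/123456'            -> request path prefix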