def parse_products(self, response):
        print "parse_products", response.url
        sel = Selector(response)
        breadcrumb = sel.xpath('//div[contains(@class,"breadCrumb")]')
        categories = [span for span in breadcrumb.xpath(".//span[@itemprop='title']/text()").extract()[1:]]
        categories.append(breadcrumb.xpath(".//span/text()").extract()[-1])
        print categories
        
        for product in sel.xpath('//div[contains(@id,"quickLookItem")]'):
            # check if it is a multistore product
            if product.xpath('.//span[contains(@id, "numStoresQA")]'):
                print product.xpath(".//a/@href").extract()[0]
                url = product.xpath(".//a/@href").extract()[0]
                url = "/".join(url.split("/")[:-1])+"/prices"
                yield Request(urlparse.urljoin(response.url, url), callback=self.parse_multiple_store_product)
            else:
                # It is not a multistore product. Parse it.
                item = ShoppingdotcomItem()
                item["categories"] = categories
                item["product_name"] = product.xpath(".//span[contains(@id, 'nameQA')]/@title").extract()[0]
                if product.xpath(".//span[@class='placeholderImg']").extract():
                    item["image_urls"] = product.xpath(".//span[@class='placeholderImg']/text()").extract()
                else:
                    item["image_urls"] = product.xpath(".//div[@class='gridItemTop']//img/@src").extract()
                item["product_urls"] = [urlparse.urljoin(response.url, product.xpath(".//a/@href").extract()[0])]
                item["stores"] = product.xpath(".//a[@class='newMerchantName']/text()").extract()
                item["prices"] = [price.replace("\n","") for price in product.xpath(".//span[@class='productPrice']/a/text()").extract()]
                yield item

        # If a "Next" page link is present, yield a request for the next URL
        if sel.xpath("//a[@name='PLN']").extract():
            yield Request(urlparse.urljoin(response.url, sel.xpath("//a[@name='PLN']/@href").extract()[0]), self.parse_products)
    def parse_start_url(self, response):
        print response.url
        sel = Selector(response)

        for url in sel.xpath("//a"):
            #print url.xpath("@href").extract()
            href = url.xpath("@href").extract()[0] if url.xpath("@href").extract() else None
            if href and href.split("/")[-1] == "products":
                yield Request(urlparse.urljoin(response.url, href), callback=self.parse_products)
            if href and href.find("xFA-") >= 0:
                href = href.replace("xFA-", "").split("~")[0]+"/products"
                yield Request(urlparse.urljoin(response.url, href), callback=self.parse_products)
    def parse_products(self, response):
        print "parse_products", response.url
        sel = Selector(response)
        breadcrumb = sel.xpath('//div[contains(@class,"breadCrumb")]')
        categories = [
            span for span in breadcrumb.xpath(
                ".//span[@itemprop='title']/text()").extract()[1:]
        ]
        categories.append(breadcrumb.xpath(".//span/text()").extract()[-1])
        print categories

        for product in sel.xpath('//div[contains(@id,"quickLookItem")]'):
            # check if it is a multistore product
            if product.xpath('.//span[contains(@id, "numStoresQA")]'):
                print product.xpath(".//a/@href").extract()[0]
                url = product.xpath(".//a/@href").extract()[0]
                url = "/".join(url.split("/")[:-1]) + "/prices"
                yield Request(urlparse.urljoin(response.url, url),
                              callback=self.parse_multiple_store_product)
            else:
                # It is not a multistore product. Parse it.
                item = ShoppingdotcomItem()
                item["categories"] = categories
                item["product_name"] = product.xpath(
                    ".//span[contains(@id, 'nameQA')]/@title").extract()[0]
                if product.xpath(".//span[@class='placeholderImg']").extract():
                    item["image_urls"] = product.xpath(
                        ".//span[@class='placeholderImg']/text()").extract()
                else:
                    item["image_urls"] = product.xpath(
                        ".//div[@class='gridItemTop']//img/@src").extract()
                item["product_urls"] = [
                    urlparse.urljoin(response.url,
                                     product.xpath(".//a/@href").extract()[0])
                ]
                item["stores"] = product.xpath(
                    ".//a[@class='newMerchantName']/text()").extract()
                item["prices"] = [
                    price.replace("\n", "") for price in product.xpath(
                        ".//span[@class='productPrice']/a/text()").extract()
                ]
                yield item

        # If a "Next" page link is present, yield a request for the next URL
        if sel.xpath("//a[@name='PLN']").extract():
            yield Request(
                urlparse.urljoin(
                    response.url,
                    sel.xpath("//a[@name='PLN']/@href").extract()[0]),
                self.parse_products)
Example No. 4
def main(argv=sys.argv):
    """ Punto de entrada al programa """
    url = "http://www.vientonomade.com.ar/index.php?option=com_content&view=category&" "layout=blog&id=8&Itemid=10"
    fetcher = httplib2.Http()
    get = partial(obtener_pagina, fetcher)

    while url:
        html = get(url)
        uri, links = buscar_links(html)
        for link in links:
            try:
                print urlparse.urljoin(url, link)
            except UnicodeEncodeError:
                pass
        url = uri and urlparse.urljoin(url, uri) or None
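
The crawler above leans on urljoin to absolutize every href before printing it. A minimal standalone sketch of the resolution rules it relies on (all URLs here are illustrative only):

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin      # Python 2

base = "http://example.com/a/b/page.html"  # hypothetical base page
assert urljoin(base, "img.png") == "http://example.com/a/b/img.png"    # relative path
assert urljoin(base, "/img.png") == "http://example.com/img.png"       # root-relative
assert urljoin(base, "http://other.com/x") == "http://other.com/x"     # absolute URL wins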
Example No. 5
    def parse_start_url(self, response):
        print response.url
        sel = Selector(response)

        for url in sel.xpath("//a"):
            #print url.xpath("@href").extract()
            href = url.xpath("@href").extract()[0] if url.xpath(
                "@href").extract() else None
            if href and href.split("/")[-1] == "products":
                yield Request(urlparse.urljoin(response.url, href),
                              callback=self.parse_products)
            if href and href.find("xFA-") >= 0:
                href = href.replace("xFA-", "").split("~")[0] + "/products"
                yield Request(urlparse.urljoin(response.url, href),
                              callback=self.parse_products)
Example No. 6
def parse(url, body, **kwargs):
    for line in body.decode('gbk', errors='ignore').splitlines():
        if line.lstrip().startswith('var docData'):
            l, r = line.find('{'), line.rfind('}')
            obj = json.loads(line[l:r + 1])
            doc = obj['result']['docinfo'][0]['foolrinfo']
            doc['title'] = obj['result']['sDocTitle']
            doc['url'] = urlparse.urljoin('http://www.xici.net',
                                          obj['result']['strPageUrl'])
            doc['date'] = '20' + doc['LongDate']
            doc['content'] = html.fromstring(
                doc['floorcontent']).text_content()

            tpl = Template('''
                <html>
                <head>
                    <meta content="text/html; charset=utf-8" http-equiv="content-type">
                    <title>{{doc['title']}}</title>
                </head>
                <body>
                    <a id="title" href="{{doc['url']}}">{{doc['title']}}</a>
                    <p id="date">{{doc['date']}}</p>
                    <div id="content">{{doc['content']}}</div>
                </body>
                </html>''')

            return tpl.render(doc=doc).encode('gbk', errors='ignore')
    else:
        return '<html/>'
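
The "var docData" handling above pulls embedded JSON out of a script line by slicing from the first "{" to the last "}". A toy illustration of the same trick (the sample line is made up):

import json

line = 'var docData = {"result": {"sDocTitle": "hello"}};'
l, r = line.find('{'), line.rfind('}')
obj = json.loads(line[l:r + 1])
assert obj['result']['sDocTitle'] == 'hello'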
Example No. 7
    def submit(self, opener, res):
        """submit WAYF form with IDP

        :param opener: the urllib2 opener
        :param data: the form data as a dictionary
        :param res: the response object

        """
        log.info('Submitting form to wayf')
        #Set IDP to correct IDP
        wayf_data = {}
        idp = self.idp
        data = self.data
        if not idp.get_idp() in data['origin']:
            raise WAYFException(
                "Can't find IdP '{0}' in WAYF's IdP list".format(
                    idp.get_idp()))
        wayf_data['origin'] = data['origin'][idp.get_idp()]
        wayf_data['shire'] = data['shire']['value']
        wayf_data['providerId'] = data['providerId']['value']
        wayf_data['target'] = data['target']['value']
        wayf_data['time'] = data['time']['value']
        wayf_data['cache'] = 'false'
        wayf_data['action'] = 'selection'
        url = urlparse.urljoin(res.url, data['form']['action'])
        data = urllib.urlencode(wayf_data)
        request = Request(url + '?' + data)
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 8
def parse(url, body, **kwargs):
    for line in body.decode('gbk', errors='ignore').splitlines():
        if line.lstrip().startswith('var docData'):
            l, r = line.find('{'), line.rfind('}')
            obj = json.loads(line[l:r+1])
            doc = obj['result']['docinfo'][0]['foolrinfo']
            doc['title'] = obj['result']['sDocTitle']
            doc['url'] = urlparse.urljoin('http://www.xici.net', obj['result']['strPageUrl'])
            doc['date'] = '20'+doc['LongDate']
            doc['content'] = html.fromstring(doc['floorcontent']).text_content()

            tpl = Template('''
                <html>
                <head>
                    <meta content="text/html; charset=utf-8" http-equiv="content-type">
                    <title>{{doc['title']}}</title>
                </head>
                <body>
                    <a id="title" href="{{doc['url']}}">{{doc['title']}}</a>
                    <p id="date">{{doc['date']}}</p>
                    <div id="content">{{doc['content']}}</div>
                </body>
                </html>''')

            return tpl.render(doc=doc).encode('gbk', errors='ignore')
    else:
        return '<html/>'
Example No. 9
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (attribute, value) in attrs:
                if (attribute == 'href'):
                    # if not full url, convert relative url to full url
                    url = urlparse.urljoin(self.base_url, value)
                    self.links.add(url)
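
A minimal driver for a collector like the one above; the class body here is a hypothetical reconstruction around the handle_starttag method, assuming a base_url attribute and a links set:

from HTMLParser import HTMLParser  # html.parser in Python 3
import urlparse                    # urllib.parse in Python 3

class LinkCollector(HTMLParser):
    def __init__(self, base_url):
        HTMLParser.__init__(self)
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href':
                    self.links.add(urlparse.urljoin(self.base_url, value))

collector = LinkCollector("http://example.com/dir/")
collector.feed('<a href="page.html">x</a>')
assert collector.links == set(["http://example.com/dir/page.html"])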
Example No. 10
    def parse(self, response):
        delinquent_link = Selector(response).xpath(
            '//*[@id="box1"]/td[1]/li/font/i/a/@href').extract()
        urllib.urlretrieve(urlparse.urljoin(response.url, delinquent_link[0]), 'delinquent.zip')
        unzip('delinquent.zip', 'delinquent')

        with open(glob.glob('delinquent/*.csv')[0], 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',')
            for idx, column in enumerate(csvreader.next()):
                column = re.sub('["]', "", column).strip()
                if column.startswith("PARCELID"):
                    parcelidcol = idx
                if column.startswith("OWNERNAME1"):
                    ownernamecol = idx
                if column.startswith("PARCELLOCATION"):
                    parcellocationcol = idx
                if column.startswith("CLS"):
                    parcelclass = idx
                if column.startswith("ASMTBLDG"):
                    buildingvalue = idx
            for row in csvreader:
                item = ReapItem()
                item['parcel_id'] = re.sub('["]', "", row[parcelidcol]).strip()
                item['parcel_location'] = row[parcellocationcol].strip()
                item['parcel_class'] = row[parcelclass].strip()
                item['building_value'] = row[buildingvalue].strip()
                request = scrapy.Request(
                    "http://mctreas.org/master.cfm?parid={0}&taxyr={1}&own1={2}".format(
                        item['parcel_id'], str(YEAR), row[ownernamecol]),
                    callback=self.get_tax_eligibility)
                request.meta['item'] = item
                yield request
Example No. 11
def install_artifacts(artifacts, dirstruct, installdir, basestaticurl):
    """
    Install the artifacts.
    """
    assert basestaticurl.endswith("/"), "Basestaticurl should end with /"
    installed = []
    for reldir, artifactnames in dirstruct.items():
        destdir = os.path.join(installdir, reldir)
        if not os.path.exists(destdir):
            log.warn(msg="Making install directory %s" % destdir)
            os.makedirs(destdir)
        else:
            assert os.path.isdir(destdir)
        for artifactname in artifactnames:
            destpath = os.path.abspath(os.path.join(destdir, artifactname))
            if artifactname in artifacts.keys():
                # The artifact must be loaded from jenkins
                theartifact = artifacts[artifactname]
            else:
                # It's probably a static file, we can get it from the static collection
                staticurl = urlparse.urljoin(basestaticurl, artifactname)
                theartifact = Artifact(artifactname, staticurl)
            theartifact.save(destpath)
            installed.append(destpath)
    return installed
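
The endswith("/") assertion above matters because urljoin treats a base without a trailing slash as a file name and replaces its last path segment. A quick standalone check:

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin      # Python 2

assert urljoin("http://host/static", "app.js") == "http://host/app.js"           # last segment replaced
assert urljoin("http://host/static/", "app.js") == "http://host/static/app.js"   # preserved with trailing slash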
Example No. 12
def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    data['name'], data['state'] = regexp_city_state.findall(data['name'])[0]
    return data
Example No. 13
    def submit(self, opener, res):
        """submit login form to COSign IdP

        :param opener: the urllib2 opener
        :param data: the form data
           as a dictionary :param res: the response object :param cm: a
           :class:`~slick.passmgr.CredentialManager` containing the URL
           to the service provider you want to connect to

        """
        idp_data = {}
        cm = self.cm
        data = self.data
        url = urlparse.urljoin(res.url, data["form"]["action"])
        log.info("Form Authentication from: %s" % url)
        idp_data[self.username_field] = cm.get_username()
        idp_data[self.password_field] = cm.get_password()
        idp_data["service"] = data["service"]["value"]
        idp_data["ref"] = data["ref"]["value"]
        data = urllib.urlencode(idp_data)
        request = Request(url, data=data)
        log.info("Submitting login form")
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 14
    def submit(self, opener, res):
        """submit WAYF form with IDP

        :param opener: the urllib2 opener
        :param data: the form data as a dictionary
        :param res: the response object

        """
        log.info("Submitting form to wayf")
        # Set IDP to correct IDP
        wayf_data = {}
        idp = self.idp
        data = self.data
        if not idp.get_idp() in data["origin"]:
            raise WAYFException("Can't find IdP '{0}' in WAYF's IdP list".format(idp.get_idp()))
        wayf_data["origin"] = data["origin"][idp.get_idp()]
        wayf_data["shire"] = data["shire"]["value"]
        wayf_data["providerId"] = data["providerId"]["value"]
        wayf_data["target"] = data["target"]["value"]
        wayf_data["time"] = data["time"]["value"]
        wayf_data["cache"] = "false"
        wayf_data["action"] = "selection"
        url = urlparse.urljoin(res.url, data["form"]["action"])
        data = urllib.urlencode(wayf_data)
        request = Request(url + "?" + data)
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 15
    def submit(self, opener, res):
        """submit login form to IdP

        :param opener: the urllib2 opener
        :param data: the form data
           as a dictionary :param res: the response object :param cm: a
           :class:`~slick.passmgr.CredentialManager` containing the URL
           to the service provider you want to connect to

        """
        idp_data = {}
        cm = self.cm
        data = self.data

        # insert the hidden fields into the post data
        for k, v in data.items():
            if "type" in v and "value" in v:
                if v.get("type") == "hidden":
                    idp_data[k] = v.get("value")

        url = urlparse.urljoin(res.url, data["form"]["action"])
        log.info("Form Authentication from: %s" % url)
        idp_data[self.username_field] = cm.get_username()
        idp_data[self.password_field] = cm.get_password()
        data = urllib.urlencode(idp_data)
        request = Request(url, data=data)
        log.info("Submitting login form")
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 16
    def parse(self, response):
        delinquent_link = Selector(response).xpath(
            '//*[@id="box1"]/td[1]/li/font/i/a/@href').extract()
        urllib.urlretrieve(urlparse.urljoin(response.url, delinquent_link[0]),
                           'delinquent.zip')
        unzip('delinquent.zip', 'delinquent')

        with open(glob.glob('delinquent/*.csv')[0], 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',')
            for idx, column in enumerate(csvreader.next()):
                column = re.sub('["]', "", column).strip()
                if column.startswith("PARCELID"):
                    parcelidcol = idx
                if column.startswith("OWNERNAME1"):
                    ownernamecol = idx
                if column.startswith("PARCELLOCATION"):
                    parcellocationcol = idx
                if column.startswith("CLS"):
                    parcelclass = idx
                if column.startswith("ASMTBLDG"):
                    buildingvalue = idx
            for row in csvreader:
                item = ReapItem()
                item['parcel_id'] = re.sub('["]', "", row[parcelidcol]).strip()
                item['parcel_location'] = row[parcellocationcol].strip()
                item['parcel_class'] = row[parcelclass].strip()
                item['building_value'] = row[buildingvalue].strip()
                request = scrapy.Request(
                    "http://mctreas.org/master.cfm?parid={0}&taxyr={1}&own1={2}"
                    .format(item['parcel_id'], str(YEAR), row[ownernamecol]),
                    callback=self.get_tax_eligibility)
                request.meta['item'] = item
                yield request
Example No. 17
def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    data['name'], data['state'] = regexp_city_state.findall(data['name'])[0]
    return data
Example No. 18
    def parse(self, response):

        delinquentLink = Selector(response).xpath(
            '//*[@id="box1"]/td[1]/li/font/i/a/@href').extract()
        urllib.urlretrieve(urlparse.urljoin(response.url, delinquentLink[0]),
                           'delinquent.zip')
        unzip('delinquent.zip', 'delinquent')

        with open(glob.glob('delinquent/*.csv')[0], 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',')
            for idx, column in enumerate(csvreader.next()):
                column = re.sub('["]', "", column).strip()
                if column.startswith("PARCELID"):
                    parcelidcol = idx
                if column.startswith("OWNERNAME1"):
                    ownernamecol = idx
                if column.startswith("PARCELLOCATION"):
                    parcellocationcol = idx
            for row in csvreader:
                item = ReapItem()
                item['parcelid'] = re.sub('["]', "", row[parcelidcol]).strip()
                item['parcellocation'] = row[parcellocationcol].strip()
                if item['parcelid'].startswith('R72'):
                    request = scrapy.Request(
                        "http://mctreas.org/master.cfm?parid=" +
                        item['parcelid'] + "&taxyr=2014" + "&own1=" +
                        row[ownernamecol],  # no stray newline in the URL
                        callback=self.getTaxEligibility)
                    request.meta['item'] = item
                    yield request
Example No. 19
def install_artifacts(artifacts, dirstruct, installdir, basestaticurl):
    """
    Install the artifacts.
    """
    assert basestaticurl.endswith("/"), "Basestaticurl should end with /"
    installed = []
    for reldir, artifactnames in dirstruct.items():
        destdir = os.path.join(installdir, reldir)
        if not os.path.exists(destdir):
            log.warn(msg="Making install directory %s" % destdir)
            os.makedirs(destdir)
        else:
            assert os.path.isdir(destdir)
        for artifactname in artifactnames:
            destpath = os.path.abspath(os.path.join(destdir, artifactname))
            if artifactname in artifacts.keys():
                # The artifact must be loaded from jenkins
                theartifact = artifacts[artifactname]
            else:
                # It's probably a static file, we can get it from the static collection
                staticurl = urlparse.urljoin(basestaticurl, artifactname)
                theartifact = Artifact(artifactname, staticurl)
            theartifact.save(destpath)
            installed.append(destpath)
    return installed
Example No. 20
    def check_config():
        """
        Check crucial configuration details for existence and workability.

        Runs checks to see whether bugtracker's URL is reachable, whether
        backend is available at the right filename, and whether the script has
        the key arguments it needs to run: URL, backend, and database details.

        The filename for the backend in the backends/ directory needs to be the
        same as the configuration argument specifying that backend. For
        instance, invoking the Launchpad backend uses 'lp', and so the filename
        is 'lp.py'.
        """
        Config.check_params(['url', 'backend'])

        if Config.backend + ".py" not in Backend.get_all_backends():
            raise InvalidConfig('Backend "' + Config.backend +
                                '" does not exist')

        url = urlparse.urlparse(Config.url)
        check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
        print("Checking URL: " + check_url)
        req = Request(check_url)
        try:
            response = urlopen(req)
        except HTTPError, e:
            raise InvalidConfig('The server could not fulfill the request ' +
                                str(e.msg) + '(' + str(e.code) + ')')
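
The urlparse/urljoin pair in check_config simply rebuilds the site root, so reachability is tested against scheme://netloc rather than the full bug-tracker URL. An equivalent standalone sketch (the URL is made up):

import urlparse  # urllib.parse in Python 3

url = urlparse.urlparse("https://bugs.example.org/project/issue/42?x=1")
check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
assert check_url == "https://bugs.example.org"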
Example No. 21
    def check_config():
        """
        Check crucial configuration details for existence and workability.

        Runs checks to see whether bugtracker's URL is reachable, whether
        backend is available at the right filename, and whether the script has
        the key arguments it needs to run: URL, backend, and database details.

        The filename for the backend in the backends/ directory needs to be the
        same as the configuration argument specifying that backend. For
        instance, invoking the Launchpad backend uses 'lp', and so the filename
        is 'lp.py'.
        """
        Config.check_params(['url', 'backend'])

        if Config.backend + ".py" not in Backend.get_all_backends():
            raise InvalidConfig('Backend "' + Config.backend + '" does not exist')

        url = urlparse.urlparse(Config.url)
        check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
        print("Checking URL: " + check_url)
        req = Request(check_url)

        if Config.backend != 'github':
            try:
                response = urlopen(req)
            except HTTPError, e:
                raise InvalidConfig('The server could not fulfill the request '
                                    + str(e.msg) + '(' + str(e.code) + ')')
            except URLError, e:
                raise InvalidConfig('We failed to reach a server. ' + str(e.reason))
Example No. 22
def bot_send_video(gesture, video_url, video_preview_img, to_mid="u2ef38a8c1f3f1c2c63bdf9c0a629023c"):
    
    headers = {}
    headers['Content-type'] = 'application/json; charset=UTF-8'
    headers['X-Line-ChannelID'] = settings.CHANNEL_ID
    headers['X-Line-ChannelSecret'] = settings.CHANNEL_SECRET
    headers['X-Line-Trusted-User-With-ACL'] = settings.CHANNEL_MID

    api = 'https://trialbot-api.line.me/v1/events'

    body = {}
    body['to'] = [to_mid]
    body['toChannel'] = 1383378250
    body['eventType'] = "138311608800106203"

    #gesture = Gesture.objects.all()[0]
    myurl = 'https://eldertranslator.herokuapp.com/'
    video_url = urlparse.urljoin(myurl, gesture.video.url)

    content = {
        "contentType": 3,
        "toType": 1,
        "originalContentUrl": video_url,
        "previewImageUrl": video_preview_img 
    }

    body['content'] = content
    req = requests.post(api, data=json.dumps(body), headers=headers, verify=False)
    
    return req
Example No. 23
    def submit(self, opener, res):
        """submit WAYF form with IDP

        :param opener: the urllib2 opener
        :param data: the form data as a dictionary
        :param res: the response object

        """
        log.info("Submitting form to wayf")
        # Set IDP to correct IDP
        wayf_data = {}
        idp = self.idp
        data = self.data
        idps = {}
        for d in data["user_idp"]:
            if isinstance(data["user_idp"][d], dict):
                idps.update(data["user_idp"][d])
        if not idp.get_idp() in idps:
            raise WAYFException(
                "Can't find IdP '%s' in WAYF's IdP list" % idp.get_idp())
        wayf_data["user_idp"] = idps[idp.get_idp()]
        wayf_data["Select"] = "Select"
        if data["form"]["action"].startswith("?"):
            # strip any query/fragment from the current URL before appending
            # the "?..." form action
            urlsp = urlparse.urlsplit(res.url)
            base = urlparse.urlunsplit((urlsp[0], urlsp[1], urlsp[2], "", ""))
            url = base + data["form"]["action"]
        else:
            url = urlparse.urljoin(res.url, data["form"]["action"])
        data = urllib.urlencode(wayf_data)
        request = Request(url, data)
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 24
    def notify(cls, alert, *args, **kwargs):
        url = cls._config.get('notifications.hipchat.url')
        token = kwargs.get('token', cls._config.get('notifications.hipchat.token'))
        repeat = kwargs.get('repeat', 0)
        notify = kwargs.get('notify', False)

        color = 'green' if alert and not alert.get('is_alert') else kwargs.get('color', 'red')

        message_text = cls._get_subject(alert, custom_message=kwargs.get('message'))

        if kwargs.get('link', False):
            zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
            alert_id = alert['alert_def']['id']
            alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''
            link_text = kwargs.get('link_text', 'go to alert')
            message_text += ' -- <a href="{}" target="_blank">{}</a>'.format(alert_url, link_text)

        message = {
            'message': message_text,
            'color': color,
            'notify': notify
        }

        try:
            logger.info(
                'Sending to: ' + '{}/v2/room/{}/notification?auth_token={}'.format(url, urllib.quote(kwargs['room']),
                                                                                   token) + ' ' + json.dumps(message))
            r = requests.post(
                '{}/v2/room/{}/notification'.format(url, urllib.quote(kwargs['room'])),
                json=message, params={'auth_token': token}, headers={'Content-type': 'application/json'})
            r.raise_for_status()
        except Exception:
            logger.exception('Hipchat write failed!')

        return repeat
Example No. 25
def _get_department_urls_from_url(url):
    soup = retrieve_soup(url)
    linked_urls = [
        urlparse.urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
    ]
    return set(linked_url for linked_url in linked_urls
               if _is_department_url(linked_url, url))
Example No. 26
    def notify(cls, alert, *args, **kwargs):

        current_span = extract_span_from_kwargs(**kwargs)

        url = cls._config.get('notifications.hipchat.url')
        token = kwargs.get('token', cls._config.get('notifications.hipchat.token'))
        repeat = kwargs.get('repeat', 0)
        notify = kwargs.get('notify', False)
        alert_def = alert['alert_def']
        message_format = kwargs.get('message_format', 'html')

        current_span.set_tag('alert_id', alert_def['id'])

        entity = alert.get('entity')
        is_changed = alert.get('alert_changed', False)
        is_alert = alert.get('is_alert', False)

        current_span.set_tag('entity', entity['id'])
        current_span.set_tag('alert_changed', bool(is_changed))
        current_span.set_tag('is_alert', is_alert)

        current_span.log_kv({'room': kwargs.get('room')})

        color = 'green' if alert and not alert.get('is_alert') else kwargs.get('color', 'red')

        message_text = cls._get_subject(alert, custom_message=kwargs.get('message'))

        if kwargs.get('link', False):
            zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
            alert_id = alert['alert_def']['id']
            alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''
            link_text = kwargs.get('link_text', 'go to alert')
            if message_format == 'html':
                message_text += ' -- <a href="{}" target="_blank">{}</a>'.format(alert_url, link_text)
            else:
                message_text += ' -- {} - {}'.format(link_text, alert_url)

        message = {
            'message': message_text,
            'color': color,
            'notify': notify,
            'message_format': message_format
        }

        try:
            logger.info(
                'Sending to: ' + '{}/v2/room/{}/notification?auth_token={}'.format(url, urllib.quote(kwargs['room']),
                                                                                   token) + ' ' + json.dumps(message))
            r = requests.post(
                '{}/v2/room/{}/notification'.format(url, urllib.quote(kwargs['room'])),
                json=message, params={'auth_token': token}, headers={'Content-type': 'application/json'})
            r.raise_for_status()
        except Exception:
            current_span.set_tag('error', True)
            current_span.log_kv({'exception': traceback.format_exc()})
            logger.exception('Hipchat write failed!')

        return repeat
Example No. 27
    def get_new_urls(self, new_url, soup):  # collect the new URLs found after parsing
        # e.g. <a target="_blank" href="/item/%E8%A7%A3%E9%87%8A%E5%99%A8">解释器</a>
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_parse_url = link['href']
            new_full_url = urljoin(new_url, new_parse_url)
            new_urls.add(new_full_url)
        return new_urls
Example No. 28
    def relative_to_full_url(self, url):
        """
        Resolve the URL based on the object's original_url
        """
        from urllib2 import urlparse
        parsed = urlparse.urlparse(url)
        if not parsed.netloc:
            url = urlparse.urljoin(self.original_url, parsed.path)
        return url
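
A usage sketch for the helper above: only URLs lacking a network location get re-anchored. Standalone, with a hypothetical original_url of "http://example.com/feed/":

import urlparse  # urllib.parse in Python 3

parsed = urlparse.urlparse("/posts/1")
assert not parsed.netloc  # relative, so the helper would re-anchor it
assert urlparse.urljoin("http://example.com/feed/", parsed.path) == "http://example.com/posts/1"

parsed = urlparse.urlparse("http://cdn.example.com/img.png")
assert parsed.netloc      # absolute, so the helper returns it unchanged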
Example No. 29
    def _kick_offline(self, con, condom):
        form = condom.xpath("//form[@id='form1']")[0]
        kvs, _ = self.process_form(form)
        # javascript:__doPostBack('gvOnLineUser','KickOut$0')
        kvs['__EVENTTARGET'] = 'gvOnLineUser'
        kvs['__EVENTARGUMENT'] = 'KickOut$0'
        action = urlparse.urljoin(con.request.url, form.attrib.get('action'))
        con = self.request_url(action, data=kvs, headers={'Referer': con.request.url})
        return 'Navigate.aspx' in con.request.url
Example No. 30
def relative_to_full_url(original_url, url):
    """
    Resolve the URL based on the original_url
    """
    from urllib2 import urlparse
    parsed = urlparse.urlparse(url)
    if not parsed.netloc:
        url = urlparse.urljoin(original_url, parsed.path)
    return url
Example No. 31
    def show(self):
        slcs_login_url = urlparse.urljoin(self.settings.slcs, 'login')
        idp_keys = list_idps(slcs_login_url).keys()
        idp_keys.sort()
        for i in idp_keys:
            self.idps.append_text(i)
            if i == self.settings.idp:
                self.idps.set_active(len(self.idps.get_model()) - 1)
        self.window.show_all()
Example No. 32
def crawls(url):
    urlStream = urlopen(url)
    htmldoc = urlStream.read()
    soup = BeautifulSoup(htmldoc)
    links = []
    images = soup.findAll("img",
                          {"src": re.compile(r'\.(jpe?g)|(png)|(gif)$')})
    for img in images:
        links.append(urlparse.urljoin(url, img["src"]))
    return links
Example No. 33
def urlIterator(startUrl, nextCssSelector):
    '''Yields the url of a page while there is a next one found by the cssSelector'''
    #This function takes time because it has to parse the dom to get the next url
    url = startUrl
    while url:
        yield url
        nextTags = getElementsFromUrl(url, nextCssSelector)
        url = None

        for possibleNext in nextTags:
            if possibleNext.tag == 'a':
                href = possibleNext.get('href')
                # Absolute href
                url = urlparse.urljoin(startUrl, href)
                break
            else:
                newTag = possibleNext.find('a')
                if newTag is not None:
                    href = newTag.get('href')
                    url = urlparse.urljoin(startUrl, href)
                    break
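
Consuming the generator is just a for loop; a hedged sketch, assuming getElementsFromUrl and a "next"-link CSS selector from the surrounding project:

# hypothetical usage of the iterator above
for page_url in urlIterator("http://example.com/list?page=1", "a.next"):
    print page_url  # scrape each paginated page in order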
Example No. 34
def crawl_courses_from_program_page_url(url, program_code):
    soup = retrieve_soup(url)
    linked_urls = [
        urlparse.urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
    ]
    course_urls = sorted(set(url for url in linked_urls
                             if _is_course_url(url)))
    return filter(None, [
        _crawl_course_data(course_url, program_code)
        for course_url in course_urls
    ])
Example No. 35
def urlIterator(startUrl, nextCssSelector):
    '''Yields the url of a page while there is a next one found by the cssSelector'''
    #This function takes time because it has to parse the dom to get the next url
    url = startUrl
    while url:
        yield url
        nextTags = getElementsFromUrl(url, nextCssSelector)
        url = None

        for possibleNext in nextTags:
            if possibleNext.tag == 'a':
                href = possibleNext.get('href')
                # Absolute href
                url = urlparse.urljoin(startUrl, href)
                break
            else:
                newTag = possibleNext.find('a')
                if newTag is not None:
                    href = newTag.get('href')
                    url = urlparse.urljoin(startUrl, href)
                    break
Example No. 36
def domIterator(startUrl, nextCssSelector):
    dom = getDOM(startUrl)
    nextSelector = CSSSelector(nextCssSelector)
    while dom is not None:
        yield dom
        nextTags = nextSelector(dom)
        dom = None
        for possibleNext in nextTags:
            if possibleNext.tag == 'a':
                url = possibleNext.get('href')
                url = urlparse.urljoin(startUrl, url)
                dom = getDOM(url)
                break
Example No. 37
def domIterator(startUrl, nextCssSelector):
    dom = getDOM(startUrl)
    nextSelector = CSSSelector(nextCssSelector)
    while dom is not None:
        yield dom
        nextTags = nextSelector(dom)
        dom = None
        for possibleNext in nextTags:
            if possibleNext.tag == 'a':
                url = possibleNext.get('href')
                url = urlparse.urljoin(startUrl, url)
                dom = getDOM(url)
                break
Example No. 38
    def submit(self, opener, res):
        """follow login link on ESOE Chooser page

        :param opener: the urllib2 opener
        :param data: the form data as a dictionary
        :param res: the response object

        """
        url = urlparse.urljoin(res.url, self.url)
        request = Request(url)
        log.debug("GET: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 39
def find_correct_element_url(params, el):
    els = el.xpath('//div[@class="wx-rb bg-blue wx-rb_v1 _item"]')
    print '---------------------------'
    print len(els)
    print '---------------------------'
    for cur_el in els:
        nick_name = cur_el.xpath('.//div[@class="txt-box"]/h3/em/text()')[0]  # ".//" keeps the query relative to cur_el
        print nick_name
        if params.name == nick_name.encode('utf8'):
            url = cur_el.xpath('@href')[0]
            url = urlparse.urljoin(base_url, url)
            return url

    return ""
Example No. 40
def _get_program_urls_from_department_url(url):
    soup = retrieve_soup(url)
    linked_urls = [
        urlparse.urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
    ]
    program_urls = set()
    for potential_program_url in linked_urls:
        if _is_course_url(potential_program_url):
            potential_program_url = ("/".join(
                potential_program_url.split("/")[:-1]))
        if _is_program_url(potential_program_url, url):
            program_urls.add(potential_program_url)
    return program_urls
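
The split("/")[:-1] trim above (also used for the "/prices" rewrite in the first example) just drops the last path segment. A standalone illustration with a made-up URL:

course_url = "http://example.edu/dept/program/course-101"
program_url = "/".join(course_url.split("/")[:-1])
assert program_url == "http://example.edu/dept/program"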
Example No. 41
    def __init__(self, service_url, infrastructure_account, verify=True, oauth2=False):

        if not service_url:
            raise ConfigurationError('EntitiesWrapper improperly configured. URL is missing!')

        self.infrastructure_account = infrastructure_account
        self.__service_url = urlparse.urljoin(service_url, 'api/v1/')
        self.__session = requests.Session()

        self.__session.headers.update({'User-Agent': get_user_agent()})
        self.__session.verify = verify

        if oauth2:
            self.__session.headers.update({'Authorization': 'Bearer {}'.format(tokens.get('uid'))})
Example No. 42
def main(url):
    # Example URL: http://picturesofshit.com/v/2009/10-15_-_Dudescademy/
    img_size_qry_string = '?g2_imageViewsIndex=1'

    # Go to gallery and grab links to high resolution photos
    gallery = urlopen(url)
    soup = BeautifulSoup(gallery.read())
    links = [tag.attrMap['href'] + img_size_qry_string for tag in soup.findAll(href=re.compile('JPG.html'))]

    # Create download directory based on url
    dir = re.search('[_+]([a-zA-Z0-9]+)/$', url).groups()[0]
    if not os.path.exists(dir): os.makedirs(dir)

    # Go to each link, grab the image source, and download
    links = [urlparse.urljoin(url, link) for link in links]
    for link in links:
        gallery_image = urlopen(link)
        soup = BeautifulSoup(gallery_image.read())
        image_url = urlparse.urljoin(url, soup.find('img', 'ImageFrame_none').attrMap['src'])
        file_name = re.search('([^/]+)$', image_url).groups()[0]
        file = os.path.join(dir, file_name)
        print 'Downloading %s' % file_name
        urlretrieve(image_url, file)
    print '--- Downloads Complete ---'
Example No. 43
    def submit(self, opener, res):
        """submit IdP form to SP

        :param opener: the urllib2 opener
        :param data: the form data as a dictionary
        :param res: the response object

        """
        log.info("Submitting SAML Verification form")
        data = self.data
        url = urlparse.urljoin(res.url, data["form"]["action"])
        data = urllib.urlencode({"SAMLRequest": data["SAMLRequest"]["value"]})
        request = Request(url, data=data)
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 44
    def submit(self, opener, res):
        """submit IdP form to SP

        :param opener: the urllib2 opener
        :param data: the form data as a dictionary
        :param res: the response object

        """
        log.info('Submitting IdP SAML form')
        data = self.data
        url = urlparse.urljoin(res.url, data['form']['action'])
        data = urllib.urlencode({'SAMLResponse': data['SAMLResponse']['value'],
                                    'RelayState': data['RelayState']['value']})
        request = Request(url, data=data)
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example No. 45
    def request(self):
        # Python 2: .items() returns a list copy, so deleting keys inside the
        # loop is safe here (it would raise RuntimeError in Python 3).
        for ticker, i in self.tickers.items():
            if i:
                path = self.real_time_path.format(ticker.lower())
                req = self.sess.get(urlparse.urljoin(self.base_url, path))
                if req.ok:
                    try:
                        price = self.parse(req.text)
                        self.callback(json.dumps({ticker.upper(): price}))
                        yield {ticker: price}
                    except Exception as e:
                        logging.error(e)
                        del self.tickers[ticker]
                else:
                    logging.error(req.reason)
            else:
                del self.tickers[ticker]
Example No. 46
    def check_config():
        """
        """
        Config.check_params(['url','backend'])
        
        if Config.backend+".py" not in Backend.get_all_backends():
            raise InvalidConfig('Backend "'+ Config.backend + '" does not exist')


        url = urlparse.urlparse(Config.url)
        check_url = urlparse.urljoin(url.scheme + '://' + url.netloc,'')
        print("Checking URL: " + check_url)
        req = Request(check_url)
        try:
            response = urlopen(req)
        except HTTPError, e:
            raise InvalidConfig('The server could not fulfill the request '
                               + str(e.msg) + '('+ str(e.code)+')')
Example No. 47
    def check_config():
        """
        """
        Config.check_params(['url', 'backend'])

        if Config.backend + ".py" not in Backend.get_all_backends():
            raise InvalidConfig('Backend "' + Config.backend +
                                '" does not exist')

        url = urlparse.urlparse(Config.url)
        check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
        print("Checking URL: " + check_url)
        req = Request(check_url)
        try:
            response = urlopen(req)
        except HTTPError, e:
            raise InvalidConfig('The server could not fulfill the request ' +
                                str(e.msg) + '(' + str(e.code) + ')')
Example No. 48
def dump():
    for fmt, fxt in MIME_TYPES.iteritems():
        dump_path = path.join(DUMP_DIR, path.basename(fmt))
        makedirs(dump_path)

        for url in [urlparse.urljoin(BASE_URL, p) for p in URL_PATHS]:
            logger.info("Request metadata in '%s' from\n  %s\n" % (fmt, url))

            req = Request(url)
            req.add_header('Accept', fmt)
            res = urlopen(req)
            fname = '%s.%s' % (path.basename(urlparse.urlparse(url).path), fxt)
            fname = path.join(dump_path, fname)

            logger.info("Write metadata into file './%s'\n" % fname)

            with open(fname, 'w') as fout:
                fout.write(res.read())
Example No. 49
def scrapOreily(indexUrl, outName):
    '''Generates an html page from the index located at indexUrl'''
    links = scraptools.getElementsFromUrl(indexUrl, '#bodyContent ol a:nth-child(1)')

    f = open(outName, 'w')

    f.write(head)

    f.write(getHTMLContent(indexUrl))

    for link in links:
        relativeLink = link.get('href')
        print relativeLink
        absoluteLink = urlparse.urljoin(indexUrl, relativeLink)

        f.write(getHTMLContent(absoluteLink))

    f.write('</body></html>')
    f.close()
Example No. 50
def get_user_json(user, profile):

    # try:
    #    user_package = User_packages.objects.get(user=user, status="A")
    #    package = user_package.package.price
    # except User_packages.DoesNotExist:
    #    package = 0

    user_investment = User_packages.objects.filter(user=user).annotate(
        investment=Sum('package__price')).values()
    investment = user_investment[0]['investment'] if user_investment else 0

    today = UTC.normalize(UTC.localize(datetime.datetime.utcnow()))
    pkg = get_package(user)
    pkg_dt = pkg.created_at.strftime("%D") if pkg else None
    return dict(
        id=user.id,
        avi_id=profile.user_auto_id,
        relationship=get_relationship(user),
        name="%s %s" % (user.first_name, user.last_name),
        # content="Total Transactional Volume: %s" % (tot_txn_vol(user)),
        sponsor_id=None
        if profile.sponser_id is None else profile.sponser_id.id,
        placement_id=None
        if profile.placement_id is None else profile.placement_id.id,
        placement_position=profile.placement_position,
        image=ICON,
        link=dict(
            href=urlparse.urljoin("https://www.avicrypto.us", "/network") +
            "#"),
        image_Name="inactive" if investment == 0 else "active",
        investment=investment,
        transaction=tot_txn_vol(user),
        binary=binary_txns(user, EPOCH_BEGIN, today),
        direct=direct_txns(user, EPOCH_BEGIN, today),
        roi=roi_txns(user, EPOCH_BEGIN, today),
        direct_left=direct_child(user, EPOCH_BEGIN, today, leg='l'),
        direct_right=direct_child(user, EPOCH_BEGIN, today, leg='r'),
        binary_left=binary_child(user, EPOCH_BEGIN, today, leg='l'),
        binary_right=binary_child(user, EPOCH_BEGIN, today, leg='r'),
        left_members_count=get_user_count(user, 'l'),
        right_members_count=get_user_count(user, 'r'),
        package_active_date=pkg_dt)
Example No. 51
    def notify(cls, alert, per_entity=False, include_alert=True, message='', repeat=0, **kwargs):
        url = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'

        repeat = kwargs.get('repeat', 0)

        # Auth key!
        service_key = kwargs.get('service_key', cls._config.get('notifications.pagerduty.servicekey'))
        zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))

        if not service_key:
            raise NotificationError('Service key is required!')

        entity = alert.get('entity')
        is_alert = alert.get('is_alert')
        event_type = 'trigger' if is_alert else 'resolve'

        alert_id = alert['alert_def']['id']
        key = 'ZMON-{}'.format(alert_id) if not per_entity else 'ZMON-{}-{}'.format(alert_id, entity['id'])

        description = message if message else cls._get_subject(alert)

        message = {
            'service_key': service_key,
            'event_type': event_type,
            'incident_key': key,
            'description': description,
            'client': 'ZMON',
            'client_url': urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else '',
            'details': json.dumps(alert, cls=JsonDataEncoder) if include_alert else '',
        }

        try:
            logger.info('Sending to %s %s', url, message)
            headers = {'User-Agent': get_user_agent(), 'Content-type': 'application/json'}

            r = requests.post(url, json=message, headers=headers, timeout=5)

            r.raise_for_status()
        except Exception as ex:
            logger.exception('Notifying Pagerduty failed %s', ex)

        return repeat
Example No. 52
    def __init__(self,
                 service_url,
                 infrastructure_account,
                 verify=True,
                 oauth2=False):

        if not service_url:
            raise ConfigurationError(
                'EntitiesWrapper improperly configured. URL is missing!')

        self.infrastructure_account = infrastructure_account
        self.__service_url = urlparse.urljoin(service_url, 'api/v1/')
        self.__session = requests.Session()

        self.__session.headers.update({'User-Agent': get_user_agent()})
        self.__session.verify = verify

        if oauth2:
            self.__session.headers.update(
                {'Authorization': 'Bearer {}'.format(tokens.get('uid'))})
Example No. 53
    def _request(self, endpoint, q, method='get'):
        try:
            url = urlparse.urljoin(self.__service_url, endpoint)

            request = getattr(self.__session, method.lower())

            if method.lower() == 'post':
                response = request(url, json=q)
            else:
                response = request(url, params={'query': json.dumps(q)})

            if response.ok:
                return response.json()
            else:
                raise CheckError(
                    'EntitiesWrapper query failed: {} with status {}:{}'.format(q, response.status_code, response.text))
        except requests.Timeout:
            raise HttpError('timeout', self.__service_url), None, sys.exc_info()[2]
        except requests.ConnectionError:
            raise HttpError('connection failed', self.__service_url), None, sys.exc_info()[2]
Example No. 54
    def parse_multiple_store_product(self, response):
        print "parse_multiple_store_product", response.url
        sel = Selector(response)
        breadcrumb = sel.xpath('//div[contains(@class,"breadCrumb")]')
        categories = [
            span for span in breadcrumb.xpath(
                ".//span[@itemprop='title']/text()").extract()[1:]
        ]
        print categories
        item = ShoppingdotcomItem()
        item["categories"] = categories
        item["product_name"] = sel.xpath(
            "//h1[@class='productTitle']/text()").extract()[0]
        item["image_urls"] = list(
            set(sel.xpath("//div[@class='imgBorder']//img/@src").extract()))
        item["product_urls"] = []
        item["stores"] = []
        item["prices"] = []
        for div in sel.xpath("//div[contains(@id,'offerItem-')]"):
            item["product_urls"].append(
                urlparse.urljoin(
                    response.url,
                    div.xpath(".//a[@class='visitBtn']/@href").extract()[0]))
            item["stores"].append(
                div.xpath(".//img[contains(@id,'DCTmerchLogo')]/@title").
                extract()[0])
            if div.xpath(".//span[contains(@class,'toSalePrice')]"):
                item["prices"].append(
                    re.findall(
                        "\S+\d+\.\d+",
                        div.xpath(
                            ".//span[contains(@class,'toSalePrice')]/text()").
                        extract()[0])[0])
            else:
                item["prices"].append(
                    re.findall(
                        "\S+\d+\.\d+",
                        div.xpath(".//span[contains(@id,'priceQA')]/text()").
                        extract()[0])[0])
        yield item
Example No. 55
def validate(amt, src_addr, addr, txn_id, coin="btc"):
    """Validates the given address and transaction of the given crypto payment type which can by anyone of BTC, ETH, XRP"""
    txn_res, addr_res = COIN[coin](src_addr, txn_id)
    assert coin in ("btc", "eth", "xrp")
    if coin == "btc":
        return is_valid_btc_paid(amt, addr, src_addr, txn_id, addr_res,
                                 txn_res)
    elif coin == "eth":
        txn_res = txn_res.json()
        addr_res = addr_res.json()
        if (txn_res.get("status", None) == "1"
                and addr_res.get("status", None) == "1"):
            uri = "api?module=account&action=txlist&address=%s&startblock=0&endblock=99999999&sort=asc&apikey=%s" % (
                src_addr, ETHER_KEY)
            res = requests.get(urlparse.urljoin(ETH_HOST, uri)).json()
            return is_valid_eth_paid(amt, src_addr, addr, txn_id, res)
    elif coin == "xrp":
        j_txn = txn_res.json()
        j_addr = addr_res.json()
        if j_addr['result'] == "success" and j_txn['result'] == "success":
            return is_valid_xrp_paid(amt, txn_id, src_addr, addr, j_txn)
    raise Exception("you should not be here")
Example No. 56
    def _request(self, endpoint, q, method='get'):
        try:
            url = urlparse.urljoin(self.__service_url, endpoint)

            request = getattr(self.__session, method.lower())

            if method.lower() == 'post':
                response = request(url, json=q)
            else:
                response = request(url, params={'query': json.dumps(q)})

            if response.ok:
                return response.json()
            else:
                raise CheckError(
                    'EntitiesWrapper query failed: {} with status {}:{}'.
                    format(q, response.status_code, response.text))
        except requests.Timeout:
            raise HttpError('timeout',
                            self.__service_url), None, sys.exc_info()[2]
        except requests.ConnectionError:
            raise HttpError('connection failed',
                            self.__service_url), None, sys.exc_info()[2]
Example No. 57
    def parse_book_0(self, response):
        sel = Selector(response)
        item = MetaItem()
        item['title'] = sel.xpath('//h1/text()').extract_first()
        item['category'] = sel.xpath('//span[contains(@itemprop,"category")]/text()').extract_first()
        item['author'] = sel.xpath('//span[contains(@itemprop,"author")]/text()').extract_first()
        item['desc'] = sel.xpath('//div[contains(@itemprop, "description")]/node()').extract()
        # find chapter
        el_chapter = sel.xpath('//li[contains(@itemprop, "itemListElement")]/node()')
        array = []

        for index, s in enumerate(el_chapter):
            ch = dict()
            ch['num'] = index + 1
            content_url = urlparse.urljoin(self.base_domain, s.xpath('@href').extract()[0])
            ch['url'] = content_url
            ch['name'] = s.xpath('span/text()').extract_first()
            array.append(ch)
            yield Request(content_url, meta={'chapter': ch}, callback=self.parse_content_0, priority=PRIORITY_MID)

        item['chapter'] = array
        yield item
Example No. 58
class Crawl:
    f = open('employee_detail.html', 'w')
    f.truncate()
    f.close()
    seed = 'http://www.reuters.com/finance/markets/indices'
    all_links = set()
    links = list()

    try:

        r = requests.get(seed)
        if r.status_code == 200:
            print('Fetching in page links...')
            # print r.status_code
            content = r.content
            soup = BeautifulSoup(content, "lxml")
            tags = soup('a')
            flg = 0
            for a in tags:
                href = a.get("href")
                if href is not None:
                    new_url = urlparse.urljoin(seed, href)
                    if new_url.find("sector") != -1:
                        print new_url
                        links.append(
                            new_url)  # 'links' contains URLs of all 10 sectors

        elif r.status_code == 403:
            print "Error: 403 Forbidden url"
        elif r.status_code == 404:
            print "Error: 404 URL not found"
        else:
            print "Make sure you have everything correct."

    except requests.exceptions.ConnectionError, e:
        print "Oops! Connection Error. Try again"
Example No. 59
def bot_send_video(gesture,
                   video_url,
                   video_preview_img,
                   to_mid="u2ef38a8c1f3f1c2c63bdf9c0a629023c"):

    headers = {}
    headers['Content-type'] = 'application/json; charset=UTF-8'
    headers['X-Line-ChannelID'] = settings.CHANNEL_ID
    headers['X-Line-ChannelSecret'] = settings.CHANNEL_SECRET
    headers['X-Line-Trusted-User-With-ACL'] = settings.CHANNEL_MID

    api = 'https://trialbot-api.line.me/v1/events'

    body = {}
    body['to'] = [to_mid]
    body['toChannel'] = 1383378250
    body['eventType'] = "138311608800106203"

    #gesture = Gesture.objects.all()[0]
    myurl = 'https://eldertranslator.herokuapp.com/'
    video_url = urlparse.urljoin(myurl, gesture.video.url)

    content = {
        "contentType": 3,
        "toType": 1,
        "originalContentUrl": video_url,
        "previewImageUrl": video_preview_img
    }

    body['content'] = content
    req = requests.post(api,
                        data=json.dumps(body),
                        headers=headers,
                        verify=False)

    return req