Example #1
    def parse_itemlisting_style(self):
        item_tds = self.soup.findAll('td', {'class' : ('itemlisting', 'itemlisting2')})
        for td in item_tds:
            tr = td.findPrevious('tr')
            item = models.Item()

            marker = tr.find(text=re.compile("Print the title"))
            title = marker.nextSibling.strip()
            title = util.unescape(title)
            item.title = util.stripNonAscii(title)

            marker = tr.find(text=re.compile("Print the author"))
            if marker is None or marker.nextSibling is None:
                author = ''
            else:
                author = marker.nextSibling.strip().strip('.')
            L = author.split(',')
            author = ','.join(L[0:2])
            author = util.unescape(author)
            item.author = util.stripNonAscii(author)

            marker = tr.find(text=re.compile("Print the date due"))
            #<td>Due <!--Print the date due--> <strong>12/10/2011,....
            dueDate = marker.parent.find('strong').string.strip()
            dueDate = dueDate.split(',')[0] #strip time
            item.dueDate = util.toDatetime(dueDate)
            self.itemsOut[item.title] = item
        print self.itemsOut
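Note: the util.unescape and util.stripNonAscii helpers these examples rely on are not shown on this page. A minimal sketch of what they plausibly do, written against the Python 3 standard library (the projects above are Python 2 and may implement them differently):

import html

def unescape(text):
    # resolve named, decimal, and hexadecimal HTML entities
    return html.unescape(text)

def stripNonAscii(text):
    # drop any character outside the 7-bit ASCII range
    return ''.join(ch for ch in text if ord(ch) < 128)

print(stripNonAscii(unescape('Caf&eacute; &#x27;Mango&#x27;')))  # Caf 'Mango'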
Example #2
    def parse(self):
        duecomments = self.soup.findAll(text=re.compile("Due Date"))

        for comment in duecomments:
            tr = comment.findPrevious('tr')
            item = models.Item()

            marker = tr.find(text=re.compile("Title"))
            if marker is None:
                marker = tr.find(text=re.compile("Print the title"))
            title = self.findcontent(marker.parent)
            title = util.unescape(title)
            item.title = util.stripNonAscii(title)

            marker = tr.find(text=re.compile("Author"))
            author = self.findcontent(marker.parent)
            L = author.split(',')
            author = ','.join(L[0:2])
            author = util.unescape(author)
            item.author = util.stripNonAscii(author)

            marker = tr.find(text=re.compile("Due Date"))
            dueDate = self.findcontent(marker.parent)
            dueDate = dueDate.split(',')[0] #strip time
            item.dueDate = util.toDatetime(dueDate)
            self.itemsOut[item.title] = item
Example #3
 def from_text(cls, text):
     match = cls.token_re.match(text)
     if not match:
         raise ParseTokenError('cannot parse Token from {}'.format(text))
     groups = match.groupdict()
     return cls(
         unescape(groups['word']),
         unescape(groups['lemma']),
         unescape(groups['pos'])
     )
Example #4
    def from_text(cls, text):
        match = cls.arg_re.match(text)
        if not match:
            raise ParseTokenError('cannot parse Argument from {}'.format(text))
        groups = match.groupdict()

        return cls(unescape(groups['word']), unescape(groups['lemma']),
                   unescape(groups['pos']),
                   groups['ner'] if groups['ner'] != 'NONE' else '',
                   int(groups['entity_idx']) if groups['entity_idx'] else -1,
                   int(groups['mention_idx']) if groups['mention_idx'] else -1)
Example #5
 def from_text(cls, text):
     match = cls.pred_re.match(text)
     if not match:
         raise ParseTokenError(
             'cannot parse Predicate from {}'.format(text))
     groups = match.groupdict()
     return cls(
         unescape(groups['word']), unescape(groups['lemma']),
         unescape(groups['pos']),
         groups['neg'] is not None,
         unescape(groups['prt']) if groups['prt'] is not None else '')
Example #6
    def _displayEntry(self, index):
        entry = self.container.items[index-1]
        urls =  util.find_urls(entry.content)
        title = util.unescape(entry.title).replace("\n", ' ').encode('utf-8')
        content = util.strip_tags(util.unescape(entry.content)).encode('utf-8')

        print title
        print content

        #uniqify the urls
        for i in list(set(urls)):
            print ''.join(i)
Example #7
    def ud(self):
        if not self.values:
            self.chat("Whatchu wanna know, bitch?")
            return

        try:
            request = pageopen('http://www.urbandictionary.com/define.php',
                               params={'term': ' '.join(self.values)})
            soup = bs4(request.text)
        except:
            self.chat("parse error")
            return

        elem = soup.find('div', {'class': 'meaning'})

        try:
            defn = []
            for string in elem.stripped_strings:
                defn.append(string)
        except:
            self.chat("couldn't find anything")
            return

        if defn:
            # Unfortunately, BeautifulSoup doesn't parse hexadecimal HTML
            # entities like &#x27; so use the parser for any stray entities.
            for paragraph in defn:
                wrapped = textwrap.wrap(paragraph, 200)
                for line in wrapped:
                    self.chat(unescape(line))
        else:
            self.chat("couldn't find anything")
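The comment above notes that older BeautifulSoup entity conversion missed hexadecimal forms such as &#x27;, which is why a separate unescape pass is applied per line. A quick illustration with the Python 3 stdlib as a stand-in for the project's parser:

import html

print(html.unescape('don&#x27;t panic'))  # don't panic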
Example #8
def loadDoi(filename):
    """Load <batch>_doi.xml"""
    #refs = load_bib_blocks(file)

    # Load the DOI records. First we assume unixref.
    keys = []
    full_path = util.sanitizeXML(filename)
    doc = minidom.parse(full_path)
    if doc.hasChildNodes():
        if doc.childNodes[0].nodeName == "doi_records":
            keys = doc.getElementsByTagName('doi_record')
        elif doc.childNodes[0].nodeName == "crossref_result":
            keys = doc.getElementsByTagName('query')
    else:
        print "Invalid result file ... ignoring %s" % filename

    #build a dictionary of the keys that have doi
    doi_keys = {}
    for key in keys:
        try:
            refkey = key.getAttribute("key")
            refdoi = key.getElementsByTagName("doi")
            if refdoi:
                newdoi = refdoi[0].childNodes[0].nodeValue.strip()
                doi_keys[refkey] = util.unescape(newdoi)
        except AttributeError:
            continue
    return doi_keys
Example #9
def code(bot, msg, language, _, code):
    """
    Run arbitrary code of the specified language.

    Usage:
      @you: @bot python `print [x ** 2 for x in xrange(10) if x % 2]`
      @bot: [1, 9, 25, 49, 81]

    Valid languages include python, py3, ruby, coffeescript, gcc (C),
    and php.
    """
    uri = 'https://eval.in/'
    data = {
        "utf8": "\xce\xbb",
        "execute": "on",
        "private": "on",
        "lang": supported_languages[language],
        "input": "",
        "code": util.flatten_incoming_text(bot, code).encode('utf-8'),
    }
    response = requests.post(uri, data)
    bot.debug(response.url)
    _, html = response.content.split("<h2>Program Output</h2>", 1)
    html = html.lstrip()
    html = html[5: html.index("</pre>")]
    output = util.unescape(html).rstrip().decode('utf-8')
    if output:
        try:
            bot.reply(u"```{}```".format(output))
        except exception.MessageTooLongException:
            bot.reply(response.url)
    else:
        bot.reply("No output...")
Example #10
    def parse(self):
        # look for pending fine
        fine = self.soup.find('div', {'id':'panelVerifyCharges'})
        if fine is not None:
            raise PendingFineException

        row = self.soup.find('div', {'id':'panelMessage'})
        titles = row.findAll('i')

        for title in titles:
            item = models.Item()

            reason = title.nextSibling.strip()
            if reason == 'is renewed.':
                item.renewed = True
                item.renewalError = None
            else:
                item.renewed = False
                error_ul = title.findNextSibling('ul')
                if error_ul is None:
                    item.renewalError = 'Renewal failed'
                else:
                    item.renewalError = error_ul.li.string

            titlestr = title.contents[0].strip()
            titlestr = util.unescape(titlestr)
            titlestr = util.stripNonAscii(titlestr)
            self.renewalItems[titlestr] = item
Example #11
	def parse(self):
		self.form = self.soup.find("form", {"name" : "hasnow"})
		row = self.soup.find('input', {'name' : 'HASNOW'})
		if row is None:
			return
		
		table = row.findPrevious('table')
		#print table.__class__.__name__

		#print table.prettify()
		rows = table.findAll('tr')
		#print len(rows)
		for itemrow in rows:
			#print row.__class__.__name__

			#print row.prettify()
			# ignore the header row -- we know it's a header if there isn't a renewal checkbox next to it
			if itemrow.find('input', {'name':'HASNOW'}) is None:
				continue
			item = models.Item()
			#print row.prettify()
			renewitemkeys = itemrow.find('input', {'name':'HASNOW'})
			
			divs = itemrow.findAll('div', {'id' : 'globaltext'})
			#print len(divs)
			title = divs[0].string.strip()
			title = util.unescape(title)
			item.title = util.stripNonAscii(title)
			#print title
			dueDate = divs[4].string.strip()
			dueDate = dueDate.split(',')[0] #strip time
			item.dueDate = util.toDatetime(dueDate)
			self.itemsOut[item.title] = item
Example #12
 def parse_title(self, td, item):
     link = td.find('a')
     title = util.unescape(link.text.strip(' :/.'))
     item.title = util.stripNonAscii(title)
     span = td.find('span')
     if span is not None and span.text is not None:
         item.author = span.text.strip(' :/.')
     return item
Example #13
    def parseTitle(self, td, item):
        links = td.findAll("a", {"class": lambda (x): x != "boldRedFont1"})
        # for some reason many title links have a superfluous ' /' at the end -- remove this
        title = links[0].string.rstrip(" /")
        title = util.unescape(title)
        item.title = util.stripNonAscii(title)

        author = links[1].string
        author = author.rstrip(".")
        if author.startswith("by "):
            author = author.replace("by ", "", 1)
        # sometimes there is extraneous information after the author's name, ex: Dylan, Bob, 1941-
        L = author.split(",")
        author = ",".join(L[0:2])
        author = util.unescape(author)
        item.author = util.stripNonAscii(author)

        return item
Example #14
 def parseTimesRenewed(self, td, item):
     links = td.findAll("a", {"class": lambda (x): x != "boldRedFont1"})
     # some horizon sites leave timesrenewed column blank instead of 0
     timesRenewed = links[0].string.strip()
     timesRenewed = util.unescape(timesRenewed)
     try:
         item.timesRenewed = int(timesRenewed)
     except ValueError:
         item.timesRenewed = 0
     return item
Example #15
 def parseTitle(self, td, item):
     span = td.find('span')
     link = span.find('a')
     if link is None:
         title = span.contents[0].strip()
     else:
         title = link.contents[0].strip()
     title = util.unescape(title)
     item.title = util.stripNonAscii(title)
     return item
Example #16
    def linker(self, urls):
        for url in urls:
            # Special behaviour for Twitter URLs
            match_twitter_urls = re.compile("http[s]?://(www.)?twitter.com/.+/status/([0-9]+)")

            twitter_urls = match_twitter_urls.findall(url)
            if len(twitter_urls):
                self.tweet(twitter_urls)
                return

            fubs = 0
            title = "Couldn't get title"
            roasted = "Couldn't roast"

            urlbase = pageopen(url)
            if not urlbase:
                fubs += 1

            try:
                opener = urllib2.build_opener()
                roasted = opener.open(SHORTENER + url).read()
            except:
                fubs += 1

            ext = url.split(".")[-1]
            images = ["gif", "png", "jpg", "jpeg"]

            if ext in images:
                title = "Image"
            elif ext == "pdf":
                title = "PDF Document"
            else:
                try:
                    cont = soup(urlbase, convertEntities=soup.HTML_ENTITIES)
                    title = cont.title.string
                except:
                    self.chat("Page parsing error")
                    return

            deli = "https://api.del.icio.us/v1/posts/add?"
            data = urllib.urlencode({"url": url, "description": title, "tags": "okdrink," + self.lastsender})

            if DELICIOUS_USER:
                base64string = base64.encodestring("%s:%s" % (DELICIOUS_USER, DELICIOUS_PASS))[:-1]
                try:
                    req = urllib2.Request(deli, data)
                    req.add_header("Authorization", "Basic %s" % base64string)
                    send = urllib2.urlopen(req)
                except:
                    self.chat("(delicious is down)")

            if fubs == 2:
                self.chat("Total fail")
            else:
                self.chat(unescape(title) + " @ " + roasted)
Example #17
def _get_categorys():

    error, categorys = _exc_sql( 'select distinct `category` from `findnsave_sale_t` limit 500' )
    categorys = categorys or []
    for d in categorys:
        for k in d:
            d[ k ] = unescape( d[ k ] )
    if categorys:
        categorys.sort( key = lambda x:x['category'] )

    return error, categorys
Example #18
def _get_categorys():

    error, categorys = _exc_sql(
        'select distinct `category` from `findnsave_sale_t` limit 500')
    categorys = categorys or []
    for d in categorys:
        for k in d:
            d[k] = unescape(d[k])
    if categorys:
        categorys.sort(key=lambda x: x['category'])

    return error, categorys
Example #19
 def get_rail_videos(self, **kwargs):
     video_count = last_count = 0
     videos = util.struct()
     videos.list = []
     videos.next = 1
     while video_count < int(self.plugin.get_setting('page_size') or 15):
         data = requests.get(RAIL_URL % kwargs).text
         # match video 'rail's
         # match: (title, video_id, date [DD/MM/AAAA],
         #         thumb, duration [MM:SS], plot)
         regExp = (
             '<li.*data-video-title="(.+?)"[\s]+data-video-id="(.+?)"[\s]+'
             + 'data-video-data-exibicao="(.+?)">[\s\S]+?'
             + '<img.+src="(.+?)"[\s\S]+?'
             + '<span class="duracao.*?">(.+?)</span>[\s\S]+?'
             + 'div class="balao">[\s]+?<p>[\s]+?([\w].+?)[\s]+?</p>'
         )
         matches = re.compile(regExp).findall(data)
         mcount = len(matches)
         properties = ('title', 'id', 'date', 'thumb', 'duration', 'plot')
         for item in matches:
             video = util.struct(dict(zip(properties, item)))
             # update attrs
             video.title = util.unescape(video.title)
             video.plot = util.unescape(video.plot)
             video.date = video.date.replace('/', '.')
             _split = video.duration.split(':')
             video.duration = sum(int(x) * 60 ** i for i, x in
                                  enumerate(reversed(_split)))
             self.cache.set('video|%s' % video.id, repr(video))
             videos.list.append(video)
         if mcount == 0 or mcount < last_count:
             videos.next = None
             break
         video_count += mcount
         last_count = mcount
         kwargs['page'] += 1
     if videos.next:
         videos.next = kwargs['page']
     return videos
Example #20
 def get_rail_videos(self, **kwargs):
     video_count = last_count = 0
     videos = util.struct()
     videos.list = []
     videos.next = 1
     while video_count < int(self.plugin.get_setting('page_size') or 15):
         data = requests.get(RAIL_URL % kwargs).text
         # match video 'rail's
         # match: (title, video_id, date [DD/MM/AAAA],
         #         thumb, duration [MM:SS], plot)
         regExp = (
             r'<li.*data-video-title="(.+?)"[\s]+data-video-id="(.+?)"[\s]+'
             + r'data-video-data-exibicao="(.+?)">[\s\S]+?'
             + r'<img.+src="(.+?)"[\s\S]+?'
             + r'<span class="duracao.*?">(.+?)</span>[\s\S]+?'
             + r'div class="balao">[\s]+?<p>[\s]+?([\w].+?)[\s]+?</p>'
         )
         matches = re.compile(regExp).findall(data)
         mcount = len(matches)
         properties = ('title', 'id', 'date', 'thumb', 'duration', 'plot')
         for item in matches:
             video = util.struct(dict(zip(properties, item)))
             # update attrs
             video.title = util.unescape(video.title)
             video.plot = util.unescape(video.plot)
             video.date = video.date.replace('/', '.')
             _split = video.duration.split(':')
             video.duration = sum(int(x) * 60 ** i for i, x in
                                  enumerate(reversed(_split)))
             self.cache.set('video|%s' % video.id, repr(video))
             videos.list.append(video)
         if mcount == 0 or mcount < last_count:
             videos.next = None
             break
         video_count += mcount
         last_count = mcount
         kwargs['page'] += 1
     if videos.next:
         videos.next = kwargs['page']
     return videos
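The duration expression in both versions folds a "MM:SS" (or "HH:MM:SS") string into seconds by weighting each field with an increasing power of 60. The same arithmetic in isolation:

def to_seconds(duration):
    parts = duration.split(':')
    return sum(int(x) * 60 ** i for i, x in enumerate(reversed(parts)))

assert to_seconds('03:25') == 205     # 3 * 60 + 25
assert to_seconds('1:02:03') == 3723  # 1 * 3600 + 2 * 60 + 3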
Example #21
    def parse(self):
        self.form = self.soup.find('form', {'name' : 'renewitems'})
        checkboxes = self.form.findAll('input', {'type' : 'checkbox'})
        for checkbox in checkboxes:
            item = models.Item()
            item.renewitemkey = checkbox['name']

            title_label = checkbox.findNext('td').label
            title = title_label.contents[2].strip()
            title = util.unescape(title)
            item.title = util.stripNonAscii(title)

            self.renewalitems[item.title] = item
Example #22
def _get_findnsave_data( table, cols ):
    error = None

    sql = "select %s from `%s`" % ( ', '.join( [ '`%s`' % c for c in cols ] ), table )
    logger.info( sql )

    error, data = _exc_sql( sql )
    data = data or []

    for d in data:
        for k in d:
            d[ k ] = unescape( d[ k ] )

    return error, data
Example #23
def _get_findnsave_data(table, cols):
    error = None

    sql = "select %s from `%s`" % (', '.join(['`%s`' % c
                                              for c in cols]), table)
    logger.info(sql)

    error, data = _exc_sql(sql)
    data = data or []

    for d in data:
        for k in d:
            d[k] = unescape(d[k])

    return error, data
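For reference, the interpolation above backquotes each column name and the table name. With illustrative arguments (not taken from the project's schema):

cols = ('name', 'area')
table = 'findnsave_brand'
sql = "select %s from `%s`" % (', '.join(['`%s`' % c for c in cols]), table)
print(sql)  # select `name`, `area` from `findnsave_brand`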
Example #24
    def _displayFeeds(self, forceRefresh=False):
        if forceRefresh:
            self.container.clearItems()
            self.container.loadItems()

        index = 1
        for item in self.container.getItems():
            title = util.unescape(item.title).replace("\n", ' ').encode('utf-8')
            if len(title) > 80:
                title = title[0:77] + '...'
            author = item.author or ''
            author = author.encode('utf-8')

            if item.isUnread():
                print "%2s: %s [%s]" % (index, title, author)
            index += 1
Example #25
    def urlparse(self, url):
        if self.cleanse(url) is False:
            return [url]

        fubs = 0
        title = "Couldn't get title"

        site = Browser(url)

        if site.error:
            self.chat('Total fail: %s' % site.error)
            return [url]

        roasted = shorten(url)
        if not roasted:
            roasted = "Couldn't roast"
            fubs += 1

        self.chat('%s @ %s' % (unescape(site.title()), roasted))
        return [url]
Example #26
File: tty.py Project: aseba/TNT
 def do_directmessage(self, **kwargs):
     if not (kwargs.has_key("params")):
         for dm in reversed(self.tnt.getDirectMessages()):
             # The formatted print_string block that used to be here
             # referenced an undefined `pos` and was never actually used;
             # print the message directly, unescaping any HTML entities.
             toprint = dm.sender_screen_name + u": " + util.unescape(dm.text)
             print(toprint.encode("utf-8"))
     elif len(kwargs["params"].split(" ")) >= 2:
         user = kwargs["params"].split(" ")[0]
         message = " ".join(kwargs["params"].split(" ")[1:])
         self.tnt.sendDirectMessage(user, message)
     else:
         print(u"──> DM: wrong param set".encode("utf-8"))
Example #27
File: tty.py Project: aseba/TNT
 def printStatus(self, pos):
     print_string = u"%(reset_color)s%(datetime)s %(id_color)s%(id)d :%(threading_color)s%(threading)s %(name)s%(nick_color)s%(username)s : %(text_color)s%(text)s%(reset_color)s"
     tweet_out_data = {"reset_color": COLOR_WHITE}
     # First, the info
     if self.config["showFullName"]:
         tweet_out_data["name"] = self.tnt.getAuthorNameFor(pos) + " "
     else:
         tweet_out_data["name"] = u""
     tweet_out_data["username"] = u"@" + self.tnt.getAuthorScreenNameFor(pos)
     tweet_out_data["id"] = self.tnt.getIdFor(pos)
     tweet_out_data["text"] = util.unescape(self.tnt.getTextFor(pos))
     tweet_out_data["datetime"] = unicode(
         datetime.datetime.fromtimestamp(self.tnt.getTimeFor(pos)).strftime("%H:%M:%S")
     )
     order = self.tnt.getThreadPositionOf(self.tnt.getIdFor(pos))
     if order > 0:
         tweet_out_data["threading"] = u" └─" + u"─" * (order - 1) + u"> "
     else:
         tweet_out_data["threading"] = ""
     # we set colors
     # if the tweet speaks about the user
     tweet_out_data["threading_color"] = COLOR_RED
     tweet_out_data["nick_color"] = COLOR_RED
     if self.tnt.getTextFor(pos).find(self.tnt.getUser().GetScreenName()) > -1:
         tweet_out_data["id_color"] = COLOR_CYAN
         tweet_out_data["text_color"] = COLOR_YELLOW
     # if the tweet's author is the user or the author is in the hilight list
     elif self.tnt.getAuthorScreenNameFor(pos).find(self.tnt.getUser().GetScreenName()) > -1:
         tweet_out_data["id_color"] = COLOR_CYAN
         tweet_out_data["text_color"] = COLOR_GREEN
     elif "@" + self.tnt.getAuthorScreenNameFor(pos) in self.config["hilight"]:
         tweet_out_data["id_color"] = COLOR_MAGENTA
         tweet_out_data["text_color"] = COLOR_MAGENTA
     # if it's a normal tweet
     else:
         tweet_out_data["id_color"] = COLOR_GREEN
         tweet_out_data["text_color"] = COLOR_WHITE
     # now we print it
     final_print_string = print_string % tweet_out_data
     print(final_print_string.encode("utf-8"))
Example #28
    def parse(self):
        #print self.soup.prettify()
        dds = self.soup.findAll('dd')

        for dd in dds:
            item = models.Item()

            reasonSoup = dd.findPrevious('strong')
            print reasonSoup.prettify()
            reason = util.inner_text(reasonSoup)
            print "reason=" + reason
            if reason == 'Item renewed':
                item.renewed = True
                item.renewalError = None
            else:
                item.renewed = False
                item.renewalError = reason

            title = dd.contents[0].strip()
            title = util.unescape(title)
            title = util.stripNonAscii(title)
            self.renewalItems[title] = item
Example #29
    def ud(self):
        if not self.values:
            return "Whatchu wanna know, bitch?"

        term = ' '.join(self.values)
        term = term.strip()

        if term == 'truffle butter':
            return "You all know what it is, and I don't want to have to read this shit again."

        try:
            request = Browser('http://www.urbandictionary.com/define.php',
                              params={'term': term})
            soup = request.soup()
        except:
            return "parse error"

        elem = soup.find('div', {'class': 'meaning'})

        try:
            defn = []
            for string in elem.stripped_strings:
                defn.append(string)
        except:
            return "couldn't find anything"

        if not defn:
            return "couldn't find anything"

        # Unfortunately, BeautifulSoup doesn't parse hexadecimal HTML
        # entities like &#x27; so use the parser for any stray entities.

        response = []
        for paragraph in defn:
            wrapped = textwrap.wrap(paragraph, 200)
            _response = unescape(' '.join(wrapped))
            response.append(_response)

        return ' '.join(response)
Example #30
def getMyVideos(session):
    result = []
    content = session.get(youtubeUrl + 'my_videos' + '?' +
                          urllib.urlencode({'o': 'U'})).text
    dummy, i = util.substr('"VIDEO_LIST_DISPLAY_OBJECT"', ':', content)
    data = json.loads(util.parseBrackets(content, i, ['[', ']']))
    for item in data:
        soup = BeautifulSoup(
            util.unescape(item['html'].decode('unicode_escape')),
            "html.parser")
        ptag = soup.find(class_="vm-video-indicators")
        privacy = 'Public'
        if not ptag.find(class_='vm-unlisted').parent.has_attr('aria-hidden'):
            privacy = 'Private'
        if not ptag.find(class_='vm-private').parent.has_attr('aria-hidden'):
            privacy = 'Private'
        try:
            duration = util.timeStrToSeconds(
                soup.find(class_="video-time").get_text())
        except:
            duration = ''
        result.append({
            'id': item['id'],
            'name': soup.find(class_="vm-video-title-content").get_text(),
            'thumb': videoImage(item['id']),
            'duration': duration,
            'privacy': privacy,
            'user': '******'
        })
    return result
Example #31
    def linker(self, urls):
        for url in urls:
            # Special behaviour for Twitter URLs
            match_twitter_urls = re.compile('http[s]?://(www.)?twitter.com/.+/status/([0-9]+)')

            twitter_urls = match_twitter_urls.findall(url)
            if len(twitter_urls):
                self.tweet(twitter_urls)
                return

            if url.find('gist.github') != -1:
                return

            if randint(1, 5) == 1:
                try:
                    self.commands.get('tweet', self.default)(url)
                except:
                    pass

            fubs = 0
            title = "Couldn't get title"

            site = Browse(url)

            if site.error:
                self.chat('Total fail: %s' % site.error)
                continue

            roasted = shorten(url)
            if not roasted:
                roasted = "Couldn't roast"
                fubs += 1

            try:
                ext = site.headers()['content-type'].split('/')[1]
            except:
                ext = False

            images = [
                'gif',
                'png',
                'jpg',
                'jpeg',
            ]

            if ext in images:
                title = 'Image'
                # Switch this to a Browse method
                if STORE_IMGS:
                    fname = url.split('/').pop()
                    path = IMGS + fname
                    self.butler.do(savefromweb, (url, path, self.lastsender), 'Thumb @ %s')

            elif ext == 'pdf':
                title = 'PDF Document'

            else:
                title = site.title()

            # If you have a delicious account set up. Yes, delicious
            # still exists. Could be updated to a cooler link
            # collecting service.
            if STORE_URLS:
                postdelicious(url, title, self.lastsender)

            if fubs == 2:
                self.chat("Total fail")
            else:
                self.chat("%s @ %s" % (unescape(title), roasted))
Example #32
def findnsave_sale():
    error = None

    #err, areas = _get_findnsave_data( 'findnsave_area', ('area',) )
    areas = [{'area': 'newyork'}]
    #err, stores = _get_findnsave_data( 'findnsave_store', ('name',) )
    stores = [
        {
            'name': 'Walmart'
        },
        {
            'name': 'Target'
        },
        {
            'name': 'Toys"R"Us'
        },
    ]
    err, brands = _get_findnsave_data('findnsave_brand', ('name', ))
    err, categorys = _get_categorys()

    if request.method == 'POST':
        keywords = request.form['keywords'].strip()
        area = request.form['select_area']
        store = request.form['select_store']
        brand = request.form['select_brand']
        category = request.form['select_category']
        num = request.form['number']
        action = request.form['action']

        where = []
        if area != 'All':
            where.append("`area`=%s" % esql.escape_string(area))
        if store != 'All':
            where.append("`retailer`=%s" % esql.escape_string(store))
        if brand != 'All':
            where.append("`brand`=%s" % esql.escape_string(brand))
        if category != 'All':
            where.append("`category`=%s" % esql.escape_string(category))

        keywords = keywords.split()
        for kw in keywords:
            where.append("`name` like %s" % esql.escape_string('%' + kw + '%'))

        if where:
            where = 'where ' + ' and '.join(where)
        else:
            where = ''

        cols = (
            '_ID',
            'area',
            'name',
            'retailer',
            'brand',
            'category',
            'price',
            'priceRegular',
            'priceOff',
            'priceUtilDate',
        )

        sql = "select * from `%s` %s" % (
            'findnsave_sale_t',
            where,
        )
        if num != 'total':
            sql += ' limit ' + num
        logger.info(sql)
        try:
            db = esql.Database(conf.dbconf)
            data = db.conn.read(sql)
        except Exception as e:
            logger.exception(repr(e))
            error = repr(e)
            data = []

        for d in data:
            for k in d:
                d[k] = unescape(d[k])
                if d[k] == '':
                    d[k] = 'not specified'

        if action == 'export':
            name = ','.join([area, store, brand, num])
            return _findnsave_sale_download(name, data, cols + ('desc', ))

        return render_template('findnsave_show_sales.html',
                               error=error,
                               areas=areas,
                               stores=stores,
                               brands=brands,
                               categorys=categorys,
                               sales=data,
                               cols=cols)

    return render_template('findnsave_show_sales.html',
                           error=error,
                           areas=areas,
                           stores=stores,
                           brands=brands,
                           categorys=categorys,
                           sales=[],
                           cols=[])
Example #33
 def render(self):
     self.body_html = _get_markup_formatter()(self.body)
     # Remove tags that were generated by the markup processor
     text = strip_tags(self.body_html)
     # Unescape entities that were generated by the markup processor
     self.body_text = unescape(text)
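The render pipeline above is: run the markup formatter, strip the tags it generated, then unescape the entities it generated. The same three steps with Python 3 stdlib stand-ins (the project's strip_tags and unescape may behave differently):

import html
import re

body_html = '<p>Tom &amp; Jerry</p>'       # pretend markup formatter output
text = re.sub(r'<[^>]+>', '', body_html)   # crude strip_tags stand-in
body_text = html.unescape(text)
print(body_text)  # Tom & Jerry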
Example #34
res_data = {}
res_data["food"] = 0
res_data["decor"] = 0
res_data["service"] = 0
res_data["cost"] = 0

res_data_list=[]

e2 = soup.find_all("div",class_="case js-case")
# print e2[0]
# print e2[0].find("div",class_="image")
for elem in e2:
    t = elem.find("div",class_="text")
    res_name = t.find("div",class_="text-cnt Restaurants").a.text
    res_cui = unescape(t.find("div",class_="text-cnt Restaurants").p.text)
    # str.decode("utf-8").replace(res_cui, "@")
    special = u"\u2022"
    res_cui = res_cui.replace(special,'@')
    res_cui = parseres_name(res_cui)

    stats = t.select(".text-stats")
    res_data["name"] = res_name
    res_data["cuisine"] = res_cui
    res_data["food"] = convert(t.select(".i-number.i-number-red")[0].text)
    res_data["decor"] = convert(t.select(".i-number")[0].text)
    res_data["service"] = convert(t.select(".i-number")[1].text)
    res_data["cost"] = convert(t.select(".i-number")[2].text)
    print res_data
    newd = res_data.copy()
    res_data_list.append(newd)
Example #35
    def real_filename_complete(self, text, line, begidx, endidx):
        """Figure out what filenames match the completion."""

        # line contains the full command line that's been entered so far.
        # text contains the portion of the line that readline is trying to complete
        # text should correspond to line[begidx:endidx]
        #
        # The way the completer works, text will start after one of the
        # characters in DELIMS. So if the filename entered so far was
        # "embedded\ sp", then text will point to the s in sp.
        #
        # The following bit of logic backs up to find the real beginning of the
        # filename.

        for before_match in range(begidx, 0, -1):
            if line[before_match] in self.DELIMS and before_match >= 1 and line[
                    before_match - 1] != '\\':
                break

        # We set fixed to be the portion of the filename which is before text
        # and match is the full portion of the filename that's been entered so
        # far (that's that part we use for matching files).
        #
        # When we return a list of completions, the bit that we return should
        # just be the portion that we replace 'text' with.

        # fixed portion of the match
        fixed = unescape(line[before_match + 1:begidx])
        # portion to match filenames against
        match = unescape(line[before_match + 1:endidx])

        # We do the following to cover the case that the current directory
        # is / and the path being entered is relative.
        if match[0] == '/':
            abs_match = match
        elif self.cur_dir == '/':
            abs_match = self.cur_dir + match
        else:
            abs_match = self.cur_dir + '/' + match

        completions = []
        prepend = ''
        if abs_match.rfind('/') == 0:  # match is in the root directory
            # This means that we're looking for matches in the root directory
            # (i.e. abs_match is /foo and the user hit TAB).
            # So we'll supply the matching board names as possible completions.
            # Since they're all treated as directories we leave the trailing slash.
            if match[0] == '/':
                completions += [
                    dev.name_path for dev in self.boards.boards()
                    if dev.name_path.startswith(abs_match)
                ]
            else:
                completions += [
                    dev.name_path[1:] for dev in self.boards.boards()
                    if dev.name_path.startswith(abs_match)
                ]
            try:
                # Add root directories of the default device
                def_dev = self.boards.default
                if match[0] == '/':
                    completions += [
                        root_dir for root_dir in def_dev.root_dirs
                        if root_dir.startswith(match)
                    ]
                else:
                    completions += [
                        root_dir[1:] for root_dir in def_dev.root_dirs
                        if root_dir[1:].startswith(match)
                    ]
            except BoardError:
                pass
        else:
            # This means that there are at least 2 slashes in abs_match. If one
            # of them matches a board name then we need to remove the board
            # name from fixed. Since the results from listdir_matches won't
            # contain the board name, we need to prepend each of the completions.
            for dev in self.boards.boards():
                if abs_match.startswith(dev.name_path):
                    prepend = dev.name_path[:-1]

        paths = sorted(auto(self.boards, listdir_matches, match))
        for path in paths:
            path = prepend + path
            completions.append(escape(path.replace(fixed, '', 1)))
        return completions
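Unlike the other examples on this page, unescape/escape here are readline-completion helpers, not HTML helpers: they presumably map between "embedded\ space" and "embedded space" so paths with spaces survive completion. A minimal sketch under that assumption:

def unescape(s):
    # turn backslash-escaped spaces back into plain spaces
    return s.replace('\\ ', ' ')

def escape(s):
    # re-escape spaces so readline treats the path as one token
    return s.replace(' ', '\\ ')

assert unescape('embedded\\ sp') == 'embedded sp'
assert escape('embedded sp') == 'embedded\\ sp'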
Example #36
def findnsave_sale():
    error = None

    #err, areas = _get_findnsave_data( 'findnsave_area', ('area',) )
    areas = [ {'area':'newyork'} ]
    #err, stores = _get_findnsave_data( 'findnsave_store', ('name',) )
    stores = [ { 'name' : 'Walmart' },
               { 'name' : 'Target' },
               { 'name' : 'Toys"R"Us' },
            ]
    err, brands = _get_findnsave_data( 'findnsave_brand', ('name',) )
    err, categorys = _get_categorys()

    if request.method == 'POST':
        keywords = request.form[ 'keywords' ].strip()
        area = request.form[ 'select_area' ]
        store = request.form[ 'select_store' ]
        brand = request.form[ 'select_brand' ]
        category = request.form[ 'select_category' ]
        num = request.form[ 'number' ]
        action = request.form[ 'action' ]

        where = []
        if area != 'All':
            where.append( "`area`=%s" % esql.escape_string(area) )
        if store != 'All':
            where.append( "`retailer`=%s" % esql.escape_string(store) )
        if brand != 'All':
            where.append( "`brand`=%s" % esql.escape_string(brand) )
        if category != 'All':
            where.append( "`category`=%s" % esql.escape_string(category) )

        keywords = keywords.split()
        for kw in keywords:
            where.append( "`name` like %s" %  esql.escape_string('%'+kw+'%') )

        if where:
            where = 'where ' + ' and '.join( where )
        else:
            where = ''

        cols = ( '_ID', 'area', 'name',
                 'retailer', 'brand', 'category',
                 'price', 'priceRegular', 'priceOff', 'priceUtilDate', )

        sql = "select * from `%s` %s" % ( 'findnsave_sale_t', where, )
        if num != 'total':
            sql += ' limit ' + num
        logger.info( sql )
        try:
            db = esql.Database( conf.dbconf )
            data = db.conn.read( sql )
        except Exception as e :
            logger.exception( repr(e) )
            error = repr(e)
            data = []

        for d in data:
            for k in d:
                d[ k ] = unescape( d[ k ] )
                if d[ k ] == '':
                    d[ k ] = 'not specified'

        if action == 'export':
            name = ','.join( [ area, store, brand, num ] )
            return _findnsave_sale_download( name, data, cols + ( 'desc', ) )

        return render_template( 'findnsave_show_sales.html', error = error,
                            areas = areas,
                            stores = stores,
                            brands = brands,
                            categorys = categorys,
                            sales = data,
                            cols = cols )

    return render_template( 'findnsave_show_sales.html', error = error,
                        areas = areas,
                        stores = stores,
                        brands = brands,
                        categorys = categorys,
                        sales = [],
                        cols = [] )
Example #37
    def linker(self, urls):
        for url in urls:
            # Special behaviour for Twitter URLs
            match_twitter_urls = re.compile(
                'http[s]?://(www.)?twitter.com/.+/status/([0-9]+)')

            twitter_urls = match_twitter_urls.findall(url)
            if len(twitter_urls):
                self.tweet(twitter_urls)
                return

            if url.find('gist.github') != -1:
                return

            if randint(1, 5) == 1:
                try:
                    self.commands.get('tweet', self.default)(url)
                except:
                    pass

            fubs = 0
            title = "Couldn't get title"

            site = Browse(url)

            if site.error:
                self.chat('Total fail: %s' % site.error)
                continue

            roasted = shorten(url)
            if not roasted:
                roasted = "Couldn't roast"
                fubs += 1

            try:
                ext = site.headers()['content-type'].split('/')[1]
            except:
                ext = False

            images = [
                'gif',
                'png',
                'jpg',
                'jpeg',
            ]

            if ext in images:
                title = 'Image'
                # Switch this to a Browse method
                if STORE_IMGS:
                    fname = url.split('/').pop()
                    path = IMGS + fname
                    self.butler.do(savefromweb, (url, path, self.lastsender),
                                   'Thumb @ %s')

            elif ext == 'pdf':
                title = 'PDF Document'

            else:
                title = site.title()

            # If you have a delicious account set up. Yes, delicious
            # still exists. Could be updated to a cooler link
            # collecting service.
            if STORE_URLS:
                postdelicious(url, title, self.lastsender)

            if fubs == 2:
                self.chat("Total fail")
            else:
                self.chat("%s @ %s" % (unescape(title), roasted))
Example #38
    def search_site(self, url, resource_dict):
        """Downloads the URL's content, searches for the paths and patterns
        and builds a message out of the matched data.

        Arguments: resource_dict contains the paths, patterns and additional data for
        the url.
        """

        if self.sitedata is None:
            return

        # retrieve content
        try:
            content = download_page(url).decode(WEB_ENCODING, "replace")
        except:
            return
        if content is None:
            return

        message = None
        title = None

        def info_xpath():
            # try to find info using xpath
            root = lxml.html.fromstring(content)
            items = root.xpath(info["xpath"])
            logger.debug("using xpath: " + info["xpath"])
            if items is not None and len(items) >= 1:
                return items[0]
            else:
                return None

        def info_regex():
            # try to find info using a regex pattern
            logger.debug("using regex: " + info["pattern"])
            match = re.search(info["pattern"], content)
            if match is None:
                logger.warning(
                    "Could not find info! (match == None) with pattern: " +
                    info["pattern"])
                return None
            if match.groups() is None:
                logger.warning("match.groups() was None")
                return None
            if len(match.groups()) <= 0:
                logger.warning("Found match but no groups")
                return None

            return match.group(1)

        for info in resource_dict["patterns"]:
            if "pattern" not in info and "xpath" not in info:
                logger.error(
                    "siteinfo entry does not contain a path or pattern!")
                break

            infodata = None
            # try regex first because it seems to be faster
            if "pattern" in info:
                infodata = info_regex()
            # try xpath if there was no pattern or regex was unsuccessful
            if infodata is None and "xpath" in info:
                infodata = info_xpath()

            if infodata is None:
                logger.warning("infodata was None!")
                break

            logger.debug("\ninfodata:\n")
            logger.debug(infodata)

            if infodata is None or infodata == "":
                continue

            logger.info("found info data: " + infodata)
            infodata = unescape(infodata)
            infodata = escape(infodata)

            infodata = infodata.strip()
            if title is None:
                title = infodata

            color = REST_COLOR
            style = REST_STYLE
            if message is None:
                message = ""
                color = FIRST_COLOR
                style = FIRST_STYLE
            message += self.msg_formats.get(
                style, self.msg_formats.get(color, infodata))
            if info != resource_dict["patterns"][-1]:
                message += " " + self.sitedata["separator"] + " "

        # cut last separator if there is one
        sep = self.sitedata["separator"]
        if message is not None and message.strip()[-len(sep):] == sep:
            message = message.strip()[:-len(sep)].strip()

        return message, title
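The unescape-then-escape pair in search_site normalizes scraped data: first decode whatever entities the page used, then re-escape the bare &, <, and > so the assembled message is safe to emit. With Python 3 stdlib stand-ins:

from html import escape, unescape

raw = 'Tom &amp; Jerry <3'
print(escape(unescape(raw), quote=False))  # Tom &amp; Jerry &lt;3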
Example #39
def create_report(model):
    """Create txt and html reports based on model values"""
    jinja_environment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
        extensions=['jinja2.ext.autoescape'],
        autoescape=True)

    q_countries, q_dates, q_queries = countries_dates_queries(model, 'downloads')
    s_countries, s_dates, s_queries = countries_dates_queries(model, 'searches')

    try:
        m_year_downloads = model['year']['downloads']
        m_year_records = model['year']['records']
    except KeyError:
        m_year_downloads = 'No data'
        m_year_records = 'No data'
    try:
        m_year_searches = model['year']['searches']
        m_year_s_records = model['year']['s_records']
    except KeyError:
        m_year_searches = 'No data'
        m_year_s_records = 'No data'
    try:
        m_hist_downloads = model['history']['downloads']
        m_hist_records = model['history']['records']
    except KeyError:
        m_hist_downloads = 'No data'
        m_hist_records = 'No data'
    try:
        m_hist_searches = model['history']['searches']
        m_hist_s_records = model['history']['s_records']
    except KeyError:
        m_hist_searches = 'No data'
        m_hist_s_records = 'No data'

    template_values = {
        # General values
        'inst': model['inst'],
        'resname': model['col'],
        'time_lapse': model['report_month_string'],
        'generated': model['created_at'],
        # Downloads
        'downloads': model['downloads']['downloads'],
        'total_downloads': model['downloads']['downloads_period'],
        'records': model['downloads']['records'],
        'total_records': model['downloads']['records_period'],
        'unique_records': model['downloads']['records_unique'],
        'len_countries': len(model['downloads']['countries_list']),
        'countries': q_countries,
        'query_dates': q_dates,
        'queries': q_queries,
        # Searches
        'searches': model['searches']['searches'],
        'records_searched': model['searches']['records'],
        's_len_countries': len(model['searches']['countries_list']),
        's_countries': s_countries,
        's_query_dates': s_dates,
        's_queries': s_queries,
        # Cumulative data
        'year_downloads': m_year_downloads,
        'year_records': m_year_records,
        'year_searches': m_year_searches,
        'year_s_records': m_year_s_records,
        'history_downloads': m_hist_downloads,
        'history_records': m_hist_records,
        'history_searches': m_hist_searches,
        'history_s_records': m_hist_s_records
    }

    template_txt = jinja_environment.get_template('template.txt')
    report_txt = unescape(template_txt.render(template_values))
    template_html = jinja_environment.get_template('template.html')
    report_html = template_html.render(template_values)

    return report_txt, report_html
Example #40
    def search_site(self, url, resource_dict):
        """Downloads the URL's content, searches for the paths and patterns
        and builds a message out of the matched data.

        Arguments: resource_dict contains the paths, patterns and additional data for
        the url.
        """

        if self.sitedata is None:
            return

        # retrieve content
        try:
            content = download_page(url).decode(WEB_ENCODING, "replace")
        except:
            return
        if content is None:
            return

        message = None
        title = None

        def info_xpath():
            # try to find info using xpath
            root = lxml.html.fromstring(content)
            items = root.xpath(info["xpath"])
            logger.debug("using xpath: " + info["xpath"])
            if items is not None and len(items) >= 1:
                return items[0]
            else:
                return None

        def info_regex():
            # try to find info using a regex pattern
            logger.debug("using regex: " + info["pattern"])
            match = re.search(info["pattern"], content)
            if match is None:
                logger.warning("Could not find info! (match == None) with pattern: " + info["pattern"])
                return None
            if match.groups() is None:
                logger.warning("match.groups() was None")
                return None
            if len(match.groups()) <= 0:
                logger.warning("Found match but no groups")
                return None

            return match.group(1)

        for info in resource_dict["patterns"]:
            if "pattern" not in info and "xpath" not in info:
                logger.error("siteinfo entry does not contain a path or pattern!")
                break

            infodata = None
            # try regex first because it seems to be faster
            if "pattern" in info:
                infodata = info_regex()
            # try xpath if there was no pattern or regex was unsuccessful
            if infodata is None and "xpath" in info:
                infodata = info_xpath()

            if infodata is None:
                logger.warning("infodata was None!")
                break

            logger.debug("\ninfodata:\n")
            logger.debug(infodata)

            if infodata is None or infodata == "":
                continue

            logger.info("found info data: " + infodata)
            infodata = unescape(infodata)
            infodata = escape(infodata)

            infodata = infodata.strip()
            if title is None:
                title = infodata

            color = REST_COLOR
            style = REST_STYLE
            if message is None:
                message = ""
                color = FIRST_COLOR
                style = FIRST_STYLE
            message += self.msg_formats.get(style, self.msg_formats.get(color, infodata))
            if info != resource_dict["patterns"][-1]:
                message += " " + self.sitedata["separator"] + " "

        # cut last separator if there is one
        sep = self.sitedata["separator"]
        if message is not None and message.strip()[-len(sep) :] == sep:
            message = message.strip()[: -len(sep)].strip()

        return message, title