Example #1
 def add_to_log(self, action, info=None, level="info"):
     log = {}
     log['action'] = strip_tags(action)
     log['info'] = strip_tags(info)
     log['level'] = strip_tags(level)
     log['created'] = int(time.time())
     self.db.logs.insert(log, safe=True)
Example #2
 def add_to_log(self, action, info=None, level="info"):
     log = {}
     log['action'] = strip_tags(action)
     log['info'] = strip_tags(info)
     log['level'] = strip_tags(level)
     log['created'] = int(time.time())
     self.db.logs.insert(log, safe=True)
Example #3
 def add_to_log(self, action, info=None, level="info"):
     log = {}
     log["action"] = strip_tags(action)
     log["info"] = strip_tags(info)
     log["level"] = strip_tags(level)
     log["created"] = int(time.time())
     self.db.logs.insert(log)
Example #4
def make_sheet_list_by_tag():
    """
	Returns an alphabetized list of tags and sheets included in each tag.
	"""
    tags = {}
    results = []

    sheet_list = db.sheets.find({"status": {"$in": LISTED_SHEETS}})
    for sheet in sheet_list:
        sheet_tags = sheet.get("tags", [])
        for tag in sheet_tags:
            if tag not in tags:
                tags[tag] = {"tag": tag, "count": 0, "sheets": []}
            tags[tag]["sheets"].append(
                {"title": strip_tags(sheet["title"]), "id": sheet["id"], "views": sheet["views"]}
            )
            tags[tag]["count"] += 1

    for tag in tags.values():
        tag["sheets"] = sorted(tag["sheets"], key=lambda x: -x["views"])
        results.append(tag)

    results = sorted(results, key=lambda x: x["tag"])

    return results
Example #5
def main():
    users = mongo.get_users(collection)
    for user in users:
        chat_id = user['user_id']
        oauth = user['lepra_oauth']
        feed_type = user.get('feed_type', 'main')
        threshold_rating = user.get('threshold_rating', 'easy')
        markpost_read = user.get('markpost_read', 'false')
        feed = get_feed(oauth, feed_type, threshold_rating)
        if not feed:
            continue
        if feed == 'deny':
            telegram_bot.get_user_oauth(chat_id, client_id, bot)
            config.logger.error(
                "Some auth error. User {}, move to prepare".format(chat_id))
            mongo.user_to_prepare(chat_id, collection)
            continue
        for key in feed:
            for post in feed[key]:
                send_to_user = ''
                post_id = post['id']
                config.logger.error("User id: {}".format(chat_id))
                config.logger.error("Post id: {}".format(post_id))
                read = mongo.check_lepra_post(post_id, chat_id,
                                              posts_collection)
                if read:
                    config.logger.error("User {} already read post: {}".format(
                        chat_id, post_id))
                    continue
                for key in post:
                    if key == 'body':
                        data = post[key]
                        data = util.strip_tags(data)
                        send_to_user = send_to_user + data + '\n'
                    elif key == '_links':
                        data = post[key][0]['href']
                        send_to_user = send_to_user + data
                if send_to_user:
                    config.logger.error("Send post {} to user {}".format(
                        post_id, chat_id))
                    time.sleep(1)
                    result = telegram_bot.send_message(send_to_user, 'text',
                                                       bot, chat_id)
                    if result:
                        config.logger.error("result is: {}".format(result))
                        if result == 'ban':
                            config.logger.error(
                                "User {} blocked bot, move to prepare".format(
                                    chat_id))
                            mongo.user_to_prepare(chat_id, collection)
                            continue
                        else:
                            mongo.add_to_lepra_posts(post['id'], chat_id,
                                                     posts_collection)
                            if markpost_read == 'true':
                                markpost_as_read(post_id, oauth)
Example #6
File: main.py Project: newslynx/lauteur
def from_string(search_str):
  """
  Takes a candidate string and
  extracts out the name(s) in list form
  >>> string = 'By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV'
  >>> authors_from_string(string)
  ['Brian Abelson', 'Michael H Keller', 'DR Stijn Debrouwere IV']
  """
  # set initial counter
  initial_count = 0

  # clean string
  search_str = strip_tags(search_str)
  search_str = re_by.sub('', search_str)
  search_str = search_str.strip()

  # tokenize
  name_tokens = [ s.strip() for s in re_name_token.split(search_str) ]

  _authors, authors = [], []
  curname = [] # List of first, last name tokens

  for token in name_tokens:

    # check if the length of the name 
    # and the token suggest an initial
    if is_initial(curname, token):

      # upper case initial & increment
      token = token.upper()
      initial_count +=1

    # if we're at a delimiter, check if the name is complete
    if token.lower() in DELIM:

      # check valid name based on initial count
      if end_name(curname, initial_count):
        _authors.append(' '.join(curname))

        # reset
        initial_count = 0
        curname = []

    # otherwise, append token
    elif not re_digits.search(token):
      curname.append(token)

  # One last check at end
  valid_name = (len(curname) >= MIN_NAME_TOKENS)
  if valid_name:
    _authors.append(' '.join(curname))

  return format_authors(_authors)
Example #7
	def run_trancxu(self, args):
		if not args:
			return "Necesas specifi vorton. Por helpo vidu %s" % self.help_url("tranĉu")
		word = urllib.quote(util.x_to_unicode(args))
		url = config.sivo_search % ("ser%c4%89o", word)
		html = util.get_html(url)
		html = re.search(r"<h2>Vortfarada Serĉo</h2>(.+?)<h2>", html, re.S).group(1)
		if "Neniu trovita" in html:
			return 'Nenio trovita por "%s".' % args
		else:
			ret = [util.strip_tags(line) for line in html.splitlines() if "<li>" in line]
			return "\n".join(ret) 
Example #8
File: grcl.py Project: askedrelic/grcl
    def _displayEntry(self, index):
        entry = self.container.items[index-1]
        urls =  util.find_urls(entry.content)
        title = util.unescape(entry.title).replace("\n", ' ').encode('utf-8')
        content = util.strip_tags(util.unescape(entry.content)).encode('utf-8')

        print title
        print content

        #uniqify the urls
        for i in list(set(urls)):
            print ''.join(i)
Example #9
def from_string(search_str):
    """
  Takes a candidate string and
  extracts out the name(s) in list form
  >>> string = 'By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV'
  >>> authors_from_string(string)
  ['Brian Abelson', 'Michael H Keller', 'DR Stijn Debrouwere IV']
  """
    # set initial counter
    initial_count = 0

    # clean string
    search_str = strip_tags(search_str)
    search_str = re_by.sub('', search_str)
    search_str = search_str.strip()

    # tokenize
    name_tokens = [s.strip() for s in re_name_token.split(search_str)]

    _authors, authors = [], []
    curname = []  # List of first, last name tokens

    for token in name_tokens:

        # check if the length of the name
        # and the token suggest an initial
        if is_initial(curname, token):

            # upper case initial & increment
            token = token.upper()
            initial_count += 1

        # if we're at a delimiter, check if the name is complete
        if token.lower() in DELIM:

            # check valid name based on initial count
            if end_name(curname, initial_count):
                _authors.append(' '.join(curname))

                # reset
                initial_count = 0
                curname = []

        # otherwise, append token
        elif not re_digits.search(token):
            curname.append(token)

    # One last check at end
    valid_name = (len(curname) >= MIN_NAME_TOKENS)
    if valid_name:
        _authors.append(' '.join(curname))

    return format_authors(_authors)
Example #10
 def test_strip_tags(self):
     self.assertEqual('', util.strip_tags(''))
     self.assertEqual('ac', util.strip_tags('a<b>c'))
     self.assertEqual('a<b', util.strip_tags('a<b'))
     self.assertEqual('a>b', util.strip_tags('a>b'))
     self.assertEqual('ace', util.strip_tags('a<b>c<d>e'))
     self.assertEqual('>ace<', util.strip_tags('>a<b>c<d>e<'))
Example #11
 def test_strip_tags(self):
     self.assertEqual('', util.strip_tags(''))
     self.assertEqual('ac', util.strip_tags('a<b>c'))
     self.assertEqual('a<b', util.strip_tags('a<b'))
     self.assertEqual('a>b', util.strip_tags('a>b'))
     self.assertEqual('ace', util.strip_tags('a<b>c<d>e'))
     self.assertEqual('>ace<', util.strip_tags('>a<b>c<d>e<'))
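The assertions in Examples #10 and #11 pin down the behavior under test: complete <...> tags are removed, while an unbalanced '<' or '>' passes through untouched. As a rough illustration only (a hedged sketch, not the actual util.strip_tags used by these projects), a regex-based implementation consistent with those assertions could look like this:

import re

def strip_tags_sketch(text):
    # Drop every complete <...> tag; a lone '<' or '>' with no partner
    # is kept, matching the unbalanced-input cases in the tests above.
    return re.sub(r'<[^<>]*>', '', text)

# Quick checks against the behavior asserted in Examples #10/#11:
assert strip_tags_sketch('a<b>c') == 'ac'
assert strip_tags_sketch('a<b') == 'a<b'
assert strip_tags_sketch('>a<b>c<d>e<') == '>ace<'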
Example #12
 def synopsis(self):
     """scraps the synopsis from the show's tvrage page using a regular 
     expression. This method might break when the page changes. unfortunatly 
     the episode summary isnt available via one of the xml feeds"""
     try:
         page = urlopen(self.link).read()
         try:
             summary = re.search(r'<div class="show_synopsis">(.*?)</div>',
                                 page, re.MULTILINE | re.DOTALL).group(1)
             return unicode(strip_tags(summary), 'utf-8').strip()
         except Exception, e:
             print('Show.synopsis: %s, %s' % (self, e))
     except URLError, e:
         print('Show.synopsis:urlopen: %s, %s' % (self, e))
Example #13
 def summary(self):
     """scraps the plot summary episode's tvrage page using a regular 
     expression this method might break when the page changes. unfortunatly 
     the episode summary isnt available via one of the xml feeds"""
     try:
         page = urlopen(self.link).read()
         if not 'Click here to add a summary' in page:
             try:
                 summary = re.search(r"</script></div><div>(.*?)<br>", page,
                                     re.MULTILINE | re.DOTALL).group(1)
                 return unicode(strip_tags(summary), 'utf-8').strip()
             except Exception, e:
                 print('Episode.summary: %s, %s' % (self, e))
     except URLError, e:
         print('Episode.summary:urlopen: %s, %s' % (self, e))
Example #14
File: api.py Project: Berimor66/mythbox
 def synopsis(self):
     """scraps the synopsis from the show's tvrage page using a regular 
     expression. This method might break when the page changes. unfortunatly 
     the episode summary isnt available via one of the xml feeds"""
     try:
         page = urlopen(self.link).read()
         try:
             summary = re.search(
                 r'<div class="show_synopsis">(.*?)</div>', page,
                 re.MULTILINE | re.DOTALL).group(1)
             return unicode(strip_tags(summary), 'utf-8').strip()
         except Exception, e:
             print('Show.synopsis: %s, %s' % (self, e))
     except URLError, e:
         print('Show.synopsis:urlopen: %s, %s' % (self, e))
Example #15
File: api.py Project: Berimor66/mythbox
 def summary(self):
     """scraps the plot summary episode's tvrage page using a regular 
     expression this method might break when the page changes. unfortunatly 
     the episode summary isnt available via one of the xml feeds"""
     try:
         page = urlopen(self.link).read()
         if not 'Click here to add a summary' in page:
             try:
                 summary = re.search(
                     r"</script></div><div>(.*?)<br>", page,
                     re.MULTILINE | re.DOTALL).group(1)
                 return unicode(strip_tags(summary), 'utf-8').strip()
             except Exception, e:
                 print('Episode.summary: %s, %s' % (self, e))
     except URLError, e:
         print('Episode.summary:urlopen: %s, %s' % (self, e))
Example #16
	def trans_majstro(self, fr, to, word):
		qword = urllib.quote(word)
		url = config.majstro_search % (fr, to, qword)
		html = util.get_html(url)
		if "could not be translated" in html:
			return 'Nenio trovita por "%s".' % word
		results = re.findall(r"<li>.+?</li>", html)
		ret = "\n".join(results)
		ret = util.strip_tags(ret)
	
		parser = HTMLParser.HTMLParser()
		ret = ret.decode('utf-8')
		ret = parser.unescape(ret)
		if type(ret) == unicode:
			ret = ret.encode('utf-8')

		ret = re.sub(": ", " → ", ret)
		ret = re.sub("; ", ", ", ret)
		return ret
Example #17
def source_text(source):
    """
    Recursive function to translate a source dictionary into text.
    """
    content = [
        source.get("customTitle", ""),
        source.get("ref", ""),
        source.get("text", {"he": ""}).get("he", ""),
        source.get("text", {"en": ""}).get("en", ""),
        source.get("comment", ""),
        source.get("outside", ""),
        ]
    content = [strip_tags(c) for c in content]
    text = " ".join(content)

    if "subsources" in source:
        for s in source["subsources"]:
            text += source_text(s)

    return text
Example #18
def source_text(source):
    """
    Recursive function to translate a source dictionary into text.
    """
    content = [
        source.get("customTitle", ""),
        source.get("ref", ""),
        source.get("text", {"he": ""})["he"],
        source.get("text", {"en": ""})["en"],
        source.get("comment", ""),
        source.get("outside", ""),
    ]
    content = [strip_tags(c) for c in content]
    text = " ".join(content)

    if "subsources" in source:
        for s in source["subsources"]:
            text += source_text(s)

    return text
Example #19
def main():
    # print(collection)
    users = mongo.get_users(collection)
    # print(type(bot))
    for user in users:
        # print(user)
        chat_id = user['user_id']
        oauth = user['lepra_oauth']
        feed_type = user.get('feed_type', 'main')
        threshold_rating = user.get('threshold_rating', 'easy')
        feed = get_feed(oauth, feed_type, threshold_rating)
        # print(feed)
        if feed == 'deny':
            telegram_bot.get_user_oauth(chat_id, client_id, bot)
            continue
        for key in feed:
            # print key
            for post in feed[key]:
                send_to_user = ''
                post_id = post['id']
                config.logger.debug("User id: {}".format(chat_id))
                config.logger.debug("Post id: {}".format(post_id))
                read = mongo.check_lepra_post(post_id, chat_id, posts_collection)
                if read:
                    config.logger.debug("User {} already read post: {}".format(chat_id, post_id))
                    continue
                for key in post:
                    if key == 'body':
                        data = post[key]
                        data = util.strip_tags(data)
                        send_to_user = send_to_user + data + '\n'
                    elif key == '_links':
                        data = post[key][0]['href']
                        send_to_user = send_to_user + data
                if send_to_user:
                    result = telegram_bot.send_message(send_to_user, 'text', bot, chat_id)
                    if result:
                        mongo.add_to_lepra_posts(post['id'], chat_id, posts_collection)
Example #20
    def html_context(self):
        """
        sets self.anchortext and self.context, where the latter is the
        surrounding text of a link, often containing author, title,
        publication info; returns self.context
        
        There are three main cases:

        (1) The context we're looking for coincides with the content
            of a single DOM element (e.g. the <a> itself, or a <li>),
            possibly minus an abstract, which we can remove
            afterwards.

        (2) The context we're looking for coincides with the content
            of several DOM elements taken together
            (e.g. "<h4>Title</h4> <div>Forthcoming
            <a>Penultimate</a></div>"), possibly minus an abstract.

        (3) The context we're looking for is part of a DOM element
            that also contains contexts for other entries. E.g.,
            "<a>Paper1</a> Forthcoming<br> <a>Paper2</a>", or
            "<h4>Paper1</h4> Forthcoming <a>PDF</a> <h4>Paper2</h4>
            <a>PDF</a>", or "<h4><a>Paper1</a></h4> Forthcoming
            <h4><a>Paper2</a></h4>".

        To tell these apart, we first climb up the DOM tree until we
        reach an element that's too large to be a single paper entry
        (careful of abstracts here). If the element right below (call
        it el) has no further text than the link with which we started
        but there's neighbouring text not in a link, we assume we're
        in case (3); here we crudely divide el's parent by <br> or
        <h*> and return the content of the part surrounding el. To
        tell apart (1) and (2), we use some heuristics to determine
        whether el's context extends to its siblings: e.g., is there a
        gap between el and sibling? does the sibling also contain a
        link to a paper? etc.
        """

        if not self.element:
            raise Exception("need link element to extract html_context")
        self.anchortext = self.element.get_attribute('textContent').strip()
        debug(5, 'trying to find link context')

        # First climb up DOM until we reach an element (par) that's
        # too large:
        el = self.element
        par = el.find_element_by_xpath('..')
        debug(5, 'starting with %s', el.get_attribute('outerHTML'))
        el._text = el.get_attribute('textContent')
        while (True):
            debug(5, 'climbing up par: %s', par.get_attribute('outerHTML'))
            # check if parent has many links or other children
            par._links = par.find_elements_by_xpath('.//a')
            par._children = par.find_elements_by_xpath('./*')
            if len(par._links) > 3 or len(par._children) > 5:
                debug(5, 'stopping: too many links or children')
                break
            # List of drafts may only contain two papers, so we also
            # check if the previous element was already fairly
            # large. (We'll still treat such lists as a single context
            # if the entries are very short, but then that's not a
            # serious problem because we won't be misled by
            # publication info that belongs to another entry.)
            par._text = par.get_attribute('textContent')
            if len(el._text) > 70 and len(par._text) > len(el._text)*1.5:
                debug(5, 'stopping: enough text already (%s)', el._text)
                break
            try:
                gpar = par.find_element_by_xpath('..')
                el,par = par,gpar
            except Exception:
                break
        
        # If el has no further text than the link with which we
        # started but there's neighbouring text not in a link, we're
        # in the messy case (3):
        if len(el._text) - len(self.element._text) < 5:
            par._outerHTML = par.get_attribute('outerHTML')
            el._outerHTML = el.get_attribute('outerHTML')
            l,r = par._outerHTML.split(el._outerHTML, 2)
            if re.search(r'\w\s*$', l) or re.search(r'^\s*\w', r):
                debug(5, 'argh: case (3)')
                for pat in (r'<h\d.*?>', r'<br>\s*<br>', r'<br>'):
                    parts = re.split(pat, par._outerHTML, flags=re.I)
                    if len(parts) > 1:
                        break
                for part in parts:
                    if el._outerHTML in part:
                        debug(5, 'surrounding part: %s', part)
                        return util.strip_tags(part)
                # we should never be here
                return el._text
        
        # Now try to figure out if siblings belong to context:
        def context_left(i):
            if par._children.index(el)-i < 0:
                # can't catch IndexError: careful of negative indices!
                return ''
            lsib = par._children[par._children.index(el)-i]
            lsib_outerHTML = lsib.get_attribute('outerHTML')
            debug(5, "add left sibling?: %s", lsib_outerHTML)
            if re.search(r'\.(?:pdf|docx?)\b', lsib_outerHTML, flags=re.I):
                debug(5, "no: contains link to pdf or doc")
                return ''
            lsib_height = int(lsib.get_attribute('offsetHeight'))
            lsib_text = lsib.get_attribute('textContent')
            if lsib_text.strip() == '' and lsib_height > 2:
                debug(5, "no: sibling has no text but takes up space")
                return ''
            lsib_bottom = lsib.location['y'] + lsib_height
            gap = par._children[par._children.index(el)-(i-1)].location['y'] - lsib_bottom
            if gap > 20 or (gap > 10 and len(context) > 20):
                debug(5, "no: too far away (%s)", gap)
                return ''
            debug(5, "yes, expanding context")
            return lsib_text

        def context_right(i):
            try:
                rsib = par._children[par._children.index(el)+i]
            except IndexError:
                return ''
            rsib_outerHTML = rsib.get_attribute('outerHTML')
            debug(5, "add right sibling?: %s", rsib_outerHTML)
            if re.search(r'\.(?:pdf|docx?)\b', rsib_outerHTML, flags=re.I):
                debug(5, "no: contains link to pdf or doc")
                return ''
            if (len(context) > 20 
                and not re.search(r'\d{4}|draft|forthcoming', rsib_outerHTML, flags=re.I)):
                # We're mainly interested in author, title,
                # publication info. The first two never occur after
                # the link element (unless that is very short: e.g. an
                # icon), so we only need to check for the third.
                debug(5, "no: doesn't look like publication info")
                return ''
            rsib_height = int(rsib.get_attribute('offsetHeight'))
            rsib_text = rsib.get_attribute('textContent')
            if rsib_text.strip() == '' and rsib_height > 2:
                debug(5, "no: sibling has no text but takes up space")
                return ''
            rsiblsib = par._children[par._children.index(el)+(i-1)]
            rsiblsib_bottom = rsiblsib.location['y'] + int(rsiblsib.get_attribute('offsetHeight'))
            gap = rsib.location['y'] - rsiblsib_bottom
            if gap > 20 or (gap > 10 and len(context) > 20):
                debug(5, "no: too far away (%s)", gap)
                return ''
            debug(5, "yes, expanding context")
            return rsib_text

        context = el.get_attribute('textContent')
        debug(5, "initial context: %s", context)
        for i in (1,2,3):
            more = context_right(i)
            if not more: break
            context += '\n' + more
        for i in (1,2,3,4):
            more = context_left(i)
            if not more: break
            context = more + '\n' + context
        # tidy up slightly (mainly for easier testing):
        self.context = re.sub(r'\s*\n+\s*', r'\n', context).strip()
        return self.context
Example #21
def do_post(balancer, content):
    balancer.read(strip_tags(content))
Example #22
def test_strip_tags():
    src = "<html>\n<body><p>toto titi.</p>\n  tata tutu.</html>"
    expected = "toto titi. tata tutu."
    assert_equals(util.strip_tags(src), expected)
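Example #22 expects more than tag removal: the whitespace left behind by the stripped markup, including newlines, is collapsed to single spaces. A minimal sketch consistent with that test (again an assumption for illustration, not the project's actual util.strip_tags) might combine the regex idea above with whitespace normalization:

import re

def strip_tags_and_collapse(text):
    # Remove complete <...> tags, then collapse all whitespace runs
    # (including newlines) into single spaces, as the test expects.
    without_tags = re.sub(r'<[^<>]*>', '', text)
    return ' '.join(without_tags.split())

src = "<html>\n<body><p>toto titi.</p>\n  tata tutu.</html>"
assert strip_tags_and_collapse(src) == "toto titi. tata tutu."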
Example #23
 def summary(self, length):
     text = util.strip_tags(self.content)
     return util.summarize(text, length)
Example #24
        as data_file:
    data = json.load(data_file)['docs']
    new_data = []
    for d in data:
        new = {}
        if 'categories' in d:
            cat = []
            for c in d['categories']:
                if 'name' in c:
                    cat.append(c['name'])
            new['rubrics'] = cat

        if 'headline' in d:
            new['headline'] = d['headline']
        if 'summary' in d:
            new['description'] = util.strip_tags(d['summary'])
        if 'address' in d:
            address = {}
            new['address'] = address
            if 'loc' in d['address']:
                point = dict(lat=d['address']['loc'][1], lng=d['address']['loc'][0])
                new['geometry'] = dict(point=point)

            address['street'] = d['address'].get('street')
            address['street_number'] = d['address'].get('number')
            address['street_type'] = d['address'].get('street_type')
            address['state'] = d['address'].get('state_name')
            address['country'] = 'Brazil'
            address['neighborhood'] = d['address'].get('neighborhood')
            address['city'] = d['address'].get('city')
Example #25
def do_post(balancer, content):
    balancer.read(strip_tags(content))
Example #26
    def handle(self, *args, **options):
        Izsek.objects.all().delete()
        izseki = []
        
        for posnetek in Posnetek.objects.all():
            
            if posnetek.podnapisi:
                
                file_path = posnetek.podnapisi.path
                
                vseb = codecs.open(file_path,encoding='utf-8',errors='replace').read()
                
                stripped = strip_tags(vseb) 
                #has_speakers = len(regex.findall(stripped)) > 0 
                vsebina_split = sent_tokenize(stripped,language='slovene')
                #pprint(vsebina_split)
                
                last_cas = ""
                izsek = Izsek()
                izsek.posnetek = posnetek
                
                for split in vsebina_split:
                    if split == "":
                        continue
                    
                    split = split.replace(u"WEBVTT","")
                    nov_cas = [ b.split("-->") for b in split.split('\n') if "-->" in b]
                    split_in = [ b.strip() for b in split.split('\n') if b != "" and "-->" not in b]
                    #pprint(nov_cas)
                    
                    if not last_cas:
                        nov_cas_a = nov_cas.pop(0)
                        last_cas = nov_cas_a[0]
                    
                    if not izsek.zacetek:
                        izsek.zacetek = last_cas
                        
                    sentence = " ".join(split_in).strip()
                    
                    if sentence.startswith((u"-",u"–")) or (len(sentence) + len(izsek.vsebina) > 500):
                        #if len(izsek.vsebina):
                        #    print izsek.zacetek,izsek.vsebina
                        izseki.append(izsek)
                        
                        izsek=Izsek()
                        izsek.vsebina = sentence.upper()
                        izsek.zacetek = last_cas
                        izsek.posnetek = posnetek
                    else:
                        izsek.vsebina += " "+sentence.upper()
                    
                    if nov_cas:
                        nov_cas_a = nov_cas.pop()
                        last_cas = nov_cas_a[0]
                
                if len(izsek.vsebina):         
                    izseki.append(izsek)    
                    
                    #pprint(cas)

            if len(izseki) > 10000:  
                Izsek.objects.bulk_create(izseki)       
                izseki = [] 
                        
        Izsek.objects.bulk_create(izseki)