Example #1
    def settings(self, stype):
        """stype: school, birth, company, tag, intro"""
        BASE = 'http://weibo.cn/dpool/ttt/'
        link = BASE + self.current_page.findAll(
            href=re.compile('setting'))[0]['href']
        self.current_page = BSS(self.br.open(link).read())
        self.rand_sleep()
        BASE = 'http://weibo.cn'
        link = BASE + self.current_page.findAll(
            href=re.compile(stype))[0]['href']
        self.current_page = BSS(self.br.open(link).read())
        self.rand_sleep()
        if stype == 'school':
            return self.set_school()

        elif stype == 'company':
            return self.set_company()

        elif stype == 'birth':
            return self.set_birth()

        elif stype == 'tag':
            pass

        elif stype == 'intro':
            pass

        print self.current_page
Example #2
 def set_company(self):
     BASE = 'http://weibo.cn'
     company_kw = ['有限', '公司', '集团', '电子', '外贸', '上海', '北京', '银行']
     data = {}
     submit_link = BASE + self.current_page.findAll('go')[0]['href']
     for postfield in self.current_page.findAll('postfield'):
         data[str(postfield['name'])] = postfield['value'].encode(
             'utf8', 'ignore')
     data['keyword'] = random.choice(company_kw)
     self.current_page = self.xml_submit(submit_link, data)
     self.rand_sleep()
     company = random.choice(
         self.current_page.findAll(href=re.compile('scn=')))
     link = BASE + company['href']
     name = company.text
     self.current_page = BSS(self.br.open(link).read())
     data = {}
     submit_link = BASE + self.current_page.findAll('go')[0]['href']
     for postfield in self.current_page.findAll('postfield'):
         data[str(postfield['name'])] = postfield['value'].encode(
             'utf8', 'ignore')
     data['scremark'] = ''
     start = random.randint(2004, 2011)
     end = random.randint(start, 2011)
     data['scend'] = str(end)
     data['scstart'] = str(start)
     self.rand_sleep()
     self.current_page = self.xml_submit(submit_link, data)
     # findAll() returns a (possibly empty) ResultSet, never None,
     # so test its truthiness rather than comparing with None.
     if self.current_page.findAll(href=re.compile('subact=del')):
         return (name, start)
     else:
         return 0
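
The set_company flow above drives weibo.cn's WAP interface: each WML <postfield> element carries one name/value pair for the form, and the <go> element holds the submit URL. A minimal sketch of that harvesting step on a made-up WML fragment (the markup and URLs are illustrative, not real weibo.cn output):

    from BeautifulSoup import BeautifulStoneSoup as BSS

    wml = ('<go href="/interface/school/submit.php">'
           '<postfield name="st" value="abc123"/>'
           '<postfield name="scid" value="42"/>'
           '</go>')
    page = BSS(wml, selfClosingTags=['postfield'])
    # Collect every postfield into the POST dict, as set_company does.
    data = dict((str(pf['name']), pf['value'].encode('utf8', 'ignore'))
                for pf in page.findAll('postfield'))
    submit_link = 'http://weibo.cn' + page.findAll('go')[0]['href']
    # data == {'st': 'abc123', 'scid': '42'}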
Example #3
 def wotd(self, irc, msg, args):
     """
     returns Merriam-Webster's Word of the Day,
     including link to mp3 audio usage example
     """
      try:
          idx = args[0]
      except IndexError:
          # no index argument given; default to the newest item
          idx = 0
     r = Request('http://www.merriam-webster.com/word/index.xml')
     doc = urlopen(r)
     html = doc.read()
     soup = BSS(html, convertEntities=BSS.XML_ENTITIES)
     item = soup.findAll('item')[int(idx)]
     mp3url = tinyurl(item.enclosure['url'])
     itemurl = tinyurl(item.link.string)
     # description is HTML in a CDATA section
     dsoup = BS(item.description.string, convertEntities=BS.HTML_ENTITIES)
     summary = ''.join(dsoup.findAll(text=True))
      summary = re.sub(r'\s+', ' ', summary)
      match = re.search(r'\d{2}, \d+ is: (.+?) Example sentence:', summary,
                        re.I | re.M | re.S)
      worddef = match.group(1).encode('ascii', 'ignore')
      worddef = re.sub(r'^\s*(?P<wotd>[\w\s]+)', r'\g<wotd>: ', worddef)
     resp = '%s (audio:%s, link:%s)' % (worddef, mp3url, itemurl)
     irc.reply(resp, prefixNick=False)
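
To see what the search pattern above extracts, here is a hedged example on made-up summary text (the real Merriam-Webster feed wording may differ):

    import re

    summary = ('The Word of the Day for June 12, 2010 is: flummox '
               '\\FLUM-uks\\ verb Example sentence: The quiz flummoxed us.')
    match = re.search(r'\d{2}, \d+ is: (.+?) Example sentence:', summary,
                      re.I | re.M | re.S)
    print match.group(1)   # -> flummox \FLUM-uks\ verb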
Example #4
 def _fetch_xml(self, function, query):
     url = "http://api.wunderground.com/auto/wui/geo/%sXML/index.xml?%s" % (
         function, urlencode({'query': query}))
     print url
     doc = web.getUrl(url, headers=HEADERS)
     # Wunderground double-encodes some of its entities, so we'll double-decode.
     return BSS(doc, convertEntities=BSS.HTML_ENTITIES)
Example #5
def PhotoMenu():
    oc = ObjectContainer(title2="Photos")

    for item in XML.ElementFromURL(RSS_FEED).xpath('//item'):

        url = item.find('link').text
        title = item.find('title').text
        date = Datetime.ParseDate(item.find('pubDate').text)

        thumb = R(ICON)
        try:
            thumb = FindPhotos(
                item.xpath('c:encoded', namespaces=PHOTO_NS)[0].text)[0]
        except:
            # no parsable photo found in this item; skip it
            continue

        summary = item.xpath('description')[0].text.replace('<p>', '').replace(
            '</p>', '').replace('<br />', "\n").replace(' [...]', '...')
        soup = BSS(summary, convertEntities=BSS.HTML_ENTITIES)
        summary = soup.contents[0]

        # Technically, I should use the url parameter of the PhotoAlbumObject to perform a service lookup.
        # However, this currently introduces an additional level in the structure which is undesired.
        # Therefore, I'm doing this all manually.
        oc.add(
            PhotoAlbumObject(key=Callback(PhotoList, url=url, title=title),
                             rating_key=url,
                             title=title,
                             thumb=thumb,
                             originally_available_at=date))

    return oc
Example #6
def parse_links(contents, rel):
    """Define a helper function for parsing feed links."""
    strainer = SoupStrainer('link', rel=rel)
    entries = [
        tag for tag in BSS(
            contents, parseOnlyThese=strainer, selfClosingTags=['link'])
    ]
    return entries
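
A hedged usage sketch for the helper above, assuming BSS is BeautifulStoneSoup and SoupStrainer comes from the same BeautifulSoup module; the Atom fragment is illustrative:

    from BeautifulSoup import BeautifulStoneSoup as BSS, SoupStrainer

    atom = ('<feed>'
            '<link rel="self" href="http://example.com/feed.atom"/>'
            '<link rel="alternate" href="http://example.com/"/>'
            '</feed>')
    # Only <link rel="self"> tags survive the strainer.
    for link in parse_links(atom, 'self'):
        print link['href']   # -> http://example.com/feed.atom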
Example #7
 def login(self, usr_name, passwd):
     self.passwd = passwd
     self.current_page = BSS(self.br.open(HOME_SOHU).read())
     login_link = HOME_SOHU + self.br.find_link(
         text_regex=re.compile(LOGIN_TXT)).url
     self.current_page = BSS(self.br.open(login_link).read())
     submit_link = HOME_SOHU + self.current_page.find('form')['action']
     data = {}
     for postfield in self.current_page.findAll('input'):
         if postfield['type'] not in ['button', 'submit']:
             data[str(postfield['name'])] = postfield['value'].encode(
                 'utf8', 'ignore')
     data['u'] = usr_name
     data['p'] = passwd
     data['fr'] = 'null'
     self.current_page = self.xml_submit(submit_link, data)
     self.rand_sleep()
Example #8
 def login(self, usr_name, passwd):
     self.passwd = passwd
     self.current_page = BSS(self.br.open(HOME_SINA).read())
     login_link = self.current_page('a', limit=1)[0]['href']
     self.current_page = BSS(self.br.open(login_link).read())
     submit_link = BASE + self.current_page.find('go')['href']
     data = {}
     for postfield in self.current_page.findAll('postfield'):
         if postfield['value'].encode('utf8', 'ignore') != '$(password)':
             data[str(postfield['name'])] = postfield['value'].encode(
                 'utf8', 'ignore')
         else:
             data[str(postfield['name'])] = passwd
     data['remember'] = '1'
     data['mobile'] = usr_name
     self.current_page = self.xml_submit(submit_link, data)
     home_link = self.current_page.find('a')['href']
     self.current_page = BSS(self.br.open(home_link).read())
     self.rand_sleep()
     return 1
Example #9
 def change_pass(self, passwd):
     data = {}
     link = HOME_SOHU + self.br.find_link(
         text_regex=re.compile(SETTING_TXT)).url
     self.current_page = BSS(self.br.open(link).read())
     self.rand_sleep()
     link = HOME_SOHU + self.current_page.findAll(
         href=re.compile('upass'))[0]['href']
     self.current_page = BSS(self.br.open(link).read())
     for postfield in self.current_page.findAll('input'):
         if postfield['type'] not in ['button', 'submit']:
             data[str(postfield['name'])] = postfield['value'].encode(
                 'utf8', 'ignore')
     data['password'] = self.passwd
     data['newpass'] = passwd
     submit_link = HOME_SOHU + self.current_page.find('form')['action']
     self.current_page = self.xml_submit(submit_link, data)
     print self.current_page
     self.rand_sleep()
Example #10
    def set_school(self):
        BASE = 'http://weibo.cn'

        link = BASE + self.current_page.findAll(
            href=re.compile('subact=search'))[0]['href']
        self.current_page = BSS(self.br.open(link).read())
        self.rand_sleep()

        link = BASE + self.current_page.findAll(
            href=re.compile('stype=1'))[0]['href']
        self.current_page = BSS(self.br.open(link).read())
        self.rand_sleep()

        link = BASE + self.current_page.findAll(
            href=re.compile('provid=31'))[0]['href']
        self.current_page = BSS(self.br.open(link).read())
        self.rand_sleep()

        link = BASE + random.choice(
            self.current_page.findAll(href=re.compile('scn=')))['href']
        self.current_page = BSS(self.br.open(link).read())
        self.rand_sleep()

        data = {}
        submit_link = BASE + self.current_page.findAll('go')[0]['href']
        for postfield in self.current_page.findAll('postfield'):
            data[str(postfield['name'])] = postfield['value'].encode(
                'utf8', 'ignore')
        name = data['scname']
        data['scremark'] = ''
        start = random.randint(1995, 2004)
        data['scstart'] = str(start)
        self.current_page = self.xml_submit(submit_link, data)
        # findAll() returns a (possibly empty) ResultSet, never None,
        # so test its truthiness rather than comparing with None.
        if self.current_page.findAll(href=re.compile('subact=del')):
            return (name, start)
        else:
            return 0
Example #11
def PhotoMenu():
    dir = MediaContainer(viewGroup='Details', title2="Photos")
    xml = HTTP.Request(RSS_FEED).content.replace('media:content', 'content')
    for item in XML.ElementFromString(xml).xpath('//item'):
        title = item.find('title').text
        summary = item.xpath('description')[0].text.replace('<p>', '').replace(
            '</p>', '').replace('<br />', "\n").replace(' [...]', '...')
        soup = BSS(summary, convertEntities=BSS.HTML_ENTITIES)
        summary = soup.contents[0]
        date = Datetime.ParseDate(
            item.find('pubDate').text).strftime('%a %b %d, %Y')
        thumb = item.xpath('content', namespaces=PHOTO_NS)[0].get('url')
        dir.Append(
            Function(DirectoryItem(PhotoList, title, date, summary, thumb),
                     key=item.find('link').text))
    return dir
Example #12
    def play(self, page, mode=''):
        if Debug: self.LOG('DEBUG: _play()\nurl: %s' % page)
        # Get current list item details...
        title = unicode(xbmc.getInfoLabel("ListItem.Title"), "utf-8")
        thumbnail = xbmc.getInfoImage("ListItem.Thumb")
        plot = unicode(xbmc.getInfoLabel("ListItem.Plot"), "utf-8")

        if mode == 'smil':
            smil = BSS(self._get(page))
            rtmp = smil.meta['base']
            video = smil.video['src']
            swfUrl = 'http://medici.tv/medici.swf'
            # rtmpdump script for console use
            rtmpdump = "rtmpdump -r %s --swfUrl http://medici.tv/medici.swf --tcUrl '%s' --playpath '%s' -o '%s.mp4'" % \
                        (rtmp, rtmp, saxutils.unescape(video), saxutils.unescape(title))
            # Build rtmp url...
            video_url = rtmp + ' swfUrl=' + swfUrl + ' tcUrl=' + rtmp + ' playpath=' + saxutils.unescape(
                video)
            if Debug:
                self.LOG('DEBUG: rtmp link details.\n\trtmp: %s\n\tswfUrl: %s\n\ttcUrl: %s\n\tplaypath: %s\n\trtmpdump: %s' %
                         (rtmp, swfUrl, rtmp, saxutils.unescape(video), rtmpdump))
        elif mode == 'rtmp_daily':
            video_url = page.split('&rtmp=1')[0]
            if Debug:
                self.LOG('DEBUG: video link details.\n\turl: %s' % video_url)
        else:
            video_url = ''
            if Debug: self.LOG('DEBUG: no video link!')
            # A bare ``raise`` here has no active exception to re-raise;
            # raise an explicit error instead.
            raise RuntimeError('no video link')
        # only need to add label, icon and thumbnail, setInfo() and addSortMethod() takes care of label2
        listitem = xbmcgui.ListItem(title,
                                    iconImage="DefaultVideo.png",
                                    thumbnailImage=thumbnail)
        # set listitem information
        listitem.setInfo('video', {
            'title': title,
            'label': title,
            'plot': plot,
            'plotoutline': plot,
        })
        # Play video...
        xbmc.Player().play(video_url, listitem)
Example #13
 def recode(text):
     return BSS(text.encode('utf8', 'ignore'),
                convertEntities=BSS.HTML_ENTITIES)
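
A hedged illustration of recode (assuming it is reachable as a plain function): re-parsing the text through BeautifulStoneSoup turns HTML entities back into characters.

    print recode(u'Fish &amp; Chips caf&eacute;')   # -> Fish & Chips café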
Example #14
def parse_ids(contents):
    """Define a helper function for parsing ids."""
    strainer = SoupStrainer('id')
    ids = [tag for tag in BSS(contents, parseOnlyThese=strainer)]
    return ids
Example #15
 def _extract_text(self, node):
     return (BSS(' '.join(node.findAll(text=True)),
                 convertEntities=BSS.HTML_ENTITIES).find(text=True))
Example #16
def parse_entries(contents):
    """Define a helper function for parsing feed entries."""
    strainer = SoupStrainer('entry')
    entries = [tag for tag in BSS(contents, parseOnlyThese=strainer)]
    return entries
Example #17
from BeautifulSoup import BeautifulStoneSoup as BSS
import codecs
import sys, os
streamWriter = codecs.lookup('utf-8')[-1]
sys.stdout = streamWriter(sys.stdout)

inf = open(sys.argv[1], "rb").read()
try:
    beta = float(sys.argv[2])
except (IndexError, ValueError):
    beta = 0.1

soup = BSS(inf)

segs = soup.findAll(lambda t: t.name == u'seg')

tot_paraphrases = 0.0
tot_segs = 0.0

for seg in segs:
    tot_segs += 1
    if seg.get('complete') == 'true':
        best = seg.find(lambda p: p.name == 'best')
        ref = seg.ref.find(text=True)
        eye_dee = seg.get(u'id')
        paraphrases = set([
            p.find(text=True)
            for p in seg.findAll(lambda e: e.name in [u'best', u'next'])
        ])
        tot_paraphrases += len(paraphrases)
        sys.stdout.write(ref + u' ||| ' + u' <-> '.join(paraphrases))
Example #18
def tinyurl(url):
    r = Request('http://tinyurl.com/api-create.php?url=%s' % url)
    doc = urlopen(r)
    soup = BSS(doc)
    return str(soup)
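
The TinyURL API returns the shortened address as the bare response body, so str(soup) is the whole reply. A hedged usage sketch (the short code shown is a placeholder):

    short = tinyurl('http://www.merriam-webster.com/word/index.xml')
    print short   # e.g. http://tinyurl.com/xxxxxxx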
Example #19
 def follow(self, user_id, st):
     if user_id != 0:
         self.current_page = BSS(
             self.br.open(FOLLOW_LINK % (st, user_id)).read())
         self.rand_sleep()
         return 1
Example #20
 def xml_submit(self, url, data):
     return BSS(self.br._mech_open(url, urllib.urlencode(data)).read())
Example #21
            s = w.strip().decode('ascii')
            words.append(s)
        except Exception:
            counter += 1
    print "\t%d words contained non ascii characters and are ommited" % counter

    articles[k_word] = {}
    # the wikipedia api restricts queries to a length of 50
    print "\tfound %d words in file" % len(words)
    for i in range((len(words) / 50) + 1):

        # create the query and parse it
        query = query_base % "|".join(words[(i * 50):(i * 50) + 50])

        text = myopener.open(query).read()
        soup = BSS(text, convertEntities=BSS.ALL_ENTITIES)
        cont = soup.api.query

        # collect all missing words
        missing = cont.pages.findAll(missing=True)
        all_missing.append([m['title'] for m in missing])

        # create dict containing all data from the available articles
        for page in cont.pages.findAll(missing=None):
            print 'title: ' + page['title']
            title = page['title']
            data = {}

            # check whether article was found through redirect
            if cont.redirects:
                redir = cont.redirects.findAll(to=title)
Example #22
 def home(self):
     self.current_page = BSS(self.br.open(HOME_SOHU).read())
     self.rand_sleep()