Пример #1
0
def filename_from_html_content(html):
    """Build an '.m4a' filename from the meta values of an html page.

    The result has the form '<programname> <displaydate> <title>.m4a'.

    * displaydate comes from the 'displaydate' meta argument; an 8-character
      value such as 20141210 is rewritten as 2014-12-10.
    * title comes from 'og:title' (or 'twitter:title' when 'og:title' has
      no ' - ' separator). Whatever follows the last ' - ' is taken as the
      programname, the rest as the title.
    * Trailing date/time words are trimmed from the title.

    Args:
        html: page content handed to find_html_meta_argument.
            (presumably the raw html text -- verify against caller)

    Returns:
        The deduced filename as a string.
    """
    trace(8, 'Trying to deduce filename from html-content.')
    programname = ''
    displaydate = find_html_meta_argument(html, 'displaydate')

    # change date from 20141210 to 2014-12-10
    if len(displaydate) == 8:
        displaydate = displaydate[:-4] + '-' + displaydate[-4:-2] + '-' + displaydate[-2:]

    title = find_html_meta_argument(html, 'og:title')

    idx = title.rfind(' - ')
    if idx < 0:
        trace(8, 'programname is not part of og:title, truing twitter:title')
        title = find_html_meta_argument(html, 'twitter:title')
        idx = title.rfind(' - ')

    if idx > 0:
        programname = title[idx+3:].strip()
        title = title[:idx]

    programname = common.unescape_html(programname)
    programname = programname.replace('/', ' ').rstrip(' .,!')
    if programname == 'Lordagsmorgon i P2':
        programname = 'Lordagsmorgon'
    trace(7, 'programname is ' + programname)

    parts = title.split(' ')

    # Trim date/time from the end: remember the index of the last word that
    # is NOT a date/time token. -1 means "no such word"; index 0 is a valid
    # answer (only the first word is kept) and must not be confused with it.
    last_to_keep = -1
    for pos, part in enumerate(parts):
        is_date_or_time = (
            re.match(r'\d+(:\d+)*', part)  # starts with digits, e.g. 12:24:00
            or part == 'kl'
            or common.is_swe_month(part)
            or common.is_swe_weekday(part)
        )
        if not is_date_or_time:
            last_to_keep = pos

    if last_to_keep < 0:
        trace(4, 'didn\'t find any valid name-parts in title. Keeping as is: "', title, '"')
    else:
        title = ' '.join(parts[:last_to_keep + 1])

    title = common.unescape_html(title)
    title = title.replace('/', ' ').strip(' .,!')

    trace(4, 'new title is ' + title + '\nskipped index ', last_to_keep, 'to ', len(parts)-1)

    filename = programname + ' ' + displaydate + ' ' + title + '.m4a'
    trace(4, 'filename: ' + filename)

    return filename
Пример #2
0
def pp_info(url, nick):
    """Fetch a Profound Programmer page and describe its post.

    Returns a list of strings, each passed through common.truncate(..., 400):
    an optional quoted title, the bracketed transcript, and the image link
    (the HD version when the page offers one). If the page cannot be
    loaded (HTTPError), a single error string addressed to `nick` is
    returned instead.

    Raises:
        AttributeError: when the page does not match the expected layout.
    """
    try:
        page = common.read_url(url)
    except HTTPError:
        return '{}: kunde inte ladda sidan: {}'.format(nick, url)

    # Locates the photo post: its image, the bracketed caption text and an
    # optional "[HD Version]" link.
    post_re = re.compile(r"""
        <li\ class="post\ photo">
        \s*
        <img\ src="(?P<img>.+?)"
        .+?
        <div\ class="caption"><p>
            \[(?P<transcript>.+?)\]
        </p>
        \s*
        (<p><a\ href="(?P<hdimg>.+?)">\[HD\ Version\]</a>)?
        """, re.DOTALL | re.VERBOSE)

    # Splits a caption of the form 'text: "title", transcript' into its
    # optional title (double or single curly quotes) and the transcript.
    caption_re = re.compile(
        r'text\s?:? (“(?P<title1>.+?)”|‘(?P<title2>.+?)’)?([,;] )?(?P<transcript>.+)',
        re.DOTALL)

    post = post_re.search(page)
    if not post:
        print(url)
        raise AttributeError('.profound could not match the regex! Has theprofoundprogrammer.org change format?')

    caption = caption_re.match(
        common.unescape_html(sanitize(post.group('transcript'))))

    if caption:
        # A later non-empty title group wins over an earlier one.
        title = caption.group('title2') or caption.group('title1') or None
        transcript = caption.group('transcript')
    else:
        # Caption did not follow the 'text: ...' layout; use it verbatim.
        title = None
        transcript = common.unescape_html(post.group('transcript'))

    image = post.group('hdimg') or post.group('img')

    lines = ['[{}]'.format(transcript), image]
    if title:
        lines.insert(0, '"{}"'.format(title))

    return [common.truncate(line, 400) for line in lines]
Пример #3
0
def xkcd_info(url, nick):
    """ Return the transcript and title of an xkcd page.

    Returns a list of strings: a first line of the form '<title> – <url>'
    followed by up to three non-empty transcript lines (or a placeholder
    line when the page has no transcript). Every entry is HTML-unescaped
    and passed through common.truncate(..., 400). If the page cannot be
    loaded (HTTPError), a single error string addressed to `nick` is
    returned instead.

    Raises:
        AttributeError: when no title can be found on the page.
    """
    try:
        data = common.read_url(url)
    except HTTPError:
        return '{}: kunde inte ladda sidan: {}'.format(nick, url)

    # The <title> element is closed by </title>, not </div>.
    title_re = re.compile(r'<title>xkcd: (.+?)</title>')
    titlebackup_re = re.compile(r'<div id="ctitle">(.+?)</div>')
    # The optional {{...}} tail is captured as 'alt' but is currently unused.
    transcript_re = re.compile(r'<div id="transcript" .+?>(?P<transcript>.*?)(\{\{(?P<alt>.+?)\}\})?</div>', re.DOTALL)

    # Transcript: a page without a transcript div is treated the same as a
    # page with an empty one.
    result = transcript_re.search(data)
    transcript = []
    if result:
        stripped_lines = (line.strip() for line in result.group('transcript').splitlines())
        transcript = [line for line in stripped_lines if line]

    if not transcript:
        transcript = ['Ingen beskrivning än!']

    # Title: prefer the <title> element, fall back to the ctitle div.
    title = title_re.search(data) or titlebackup_re.search(data)
    if not title:
        # Same exception type as the implicit None.group() failure, but with
        # a message that says what went wrong.
        raise AttributeError('xkcd_info could not find a title on {}'.format(url))

    firstline = '{} – {}'.format(title.group(1), url)

    return [common.truncate(common.unescape_html(x), 400) for x in [firstline] + transcript[:3]]
Пример #4
0
def sanitise(text):
    """Clean up a raw text fragment for display.

    Steps, in order: drop non-breaking spaces, decode backslash escapes
    with the 'unicode_escape' codec, HTML-unescape, rewrite
    <sup>a</sup>⁄<sub>b</sub> fractions as ' a/b', and turn the remaining
    <sup>...</sup> markup into '^(...)'.
    """
    decode_escapes = codecs.getdecoder('unicode_escape')

    # Workaround: non-breaking spaces are simply removed.
    without_nbsp = text.replace('\xa0', '')
    decoded, _consumed = decode_escapes(without_nbsp)
    cleaned = common.unescape_html(decoded)

    # Note the special ⁄-char (not a regular slash) between </sup> and <sub>
    cleaned = re.sub(r'<sup>(\d+)</sup>⁄<sub>(\d+)</sub>', r' \1/\2', cleaned)

    for old, new in (('<sup>', '^('), ('</sup>', ')'), ('( ', '(')):
        cleaned = cleaned.replace(old, new)
    return cleaned
Пример #5
0
 def parse(cls, api, json):
     """Create a `cls` instance with one attribute per entry of `json`.

     'created_at' values go through parse_search_datetime and 'source'
     values through parse_html_value(unescape_html(...)); every other
     value is stored unchanged. `api` is accepted but not used here.
     """
     instance = cls()
     for key, raw in json.items():
         if key == 'created_at':
             value = parse_search_datetime(raw)
         elif key == 'source':
             value = parse_html_value(unescape_html(raw))
         else:
             value = raw
         setattr(instance, key, value)
     return instance
Пример #6
0
 def parse(cls, api, json):
     """Build an instance of `cls` from the key/value pairs in `json`.

     Each key becomes an attribute on the new object. The 'created_at'
     value is converted with parse_search_datetime, the 'source' value
     with parse_html_value(unescape_html(...)); all other values are
     copied as they are. The `api` argument is not used by this method.
     """
     obj = cls()
     for name, value in json.items():
         if name == 'created_at':
             value = parse_search_datetime(value)
         elif name == 'source':
             value = parse_html_value(unescape_html(value))
         setattr(obj, name, value)
     return obj
Пример #7
0
 def fixhtml(str):
   """Flatten an html fragment to one line of HTML-unescaped text.

   Runs of <br/> / <br /> tags and every newline or carriage return
   become a single space, all remaining tags are removed, and the result
   is passed through common.unescape_html.
   """
   # (pattern, replacement) pairs, applied in this order.
   substitutions = (
     (r'(<br ?/>)+', r' '),
     ('\n', r' '),
     ('\r', r' '),
     (r'(<.+?>)+', r''),
   )
   cleaned = str
   for pattern, replacement in substitutions:
     cleaned = re.sub(pattern, replacement, cleaned)
   return common.unescape_html(cleaned)