Exemplo n.º 1
0
 def _get_title(_url):
     """Fetch the page at ``_url`` and return its cleaned-up title text.

     The page is requested with an English ``Accept-Language`` header and
     parsed with ``etree.parse`` using ``hparser``.  The text of the first
     ``<title>`` element is handed to the ``title_cleaners`` entry selected
     by ``get_host_code(_url)``, and the characters ``| * [ ] ( ) ~`` and
     the backslash are then removed from the cleaner's result.

     Raises ``AttributeError`` when the document has no ``<title>`` element
     (``find`` returns ``None``); errors from the request or the parser
     propagate unchanged.
     """
     headers = {'Accept-Language': 'en-US,en;q=0.5'}
     response = urlopen(Request(_url, headers=headers))
     try:
         htree = etree.parse(response, hparser)
     finally:
         # Release the connection even when parsing fails.
         response.close()
     raw_title = htree.find(".//title").text

     code = get_host_code(_url)
     title = title_cleaners[code](raw_title)
     # Raw string so every backslash reaches the regex engine untouched
     # instead of relying on unrecognised string escapes.
     return re.sub(r'[\|\*\[\]\(\)~\\]', '', title)
Exemplo n.º 2
0
    def _get_title(_url):
        """Download ``_url`` and return the page title after clean-up.

        Steps, in order:

        1. Request the page with an English ``Accept-Language`` header.
        2. Parse the response with ``etree.parse`` and ``hparser`` and read
           the text of the first ``<title>`` element.
        3. Pass that text to ``title_cleaners[get_host_code(_url)]``.
        4. Strip the characters ``| * [ ] ( ) ~`` and the backslash from
           the cleaner's result.

        Raises ``AttributeError`` if no ``<title>`` element is found
        (``find`` returns ``None``); request and parser errors propagate.
        """
        HEADER = {'Accept-Language': 'en-US,en;q=0.5'}
        request = Request(_url, headers=HEADER)
        data = urlopen(request)
        try:
            htree = etree.parse(data, hparser)
        finally:
            # Always close the HTTP response so the socket is not leaked.
            data.close()
        raw_title = htree.find(".//title").text

        code = get_host_code(_url)
        title = title_cleaners[code](raw_title)
        # Raw-string pattern: matches the same set of characters without
        # depending on invalid escape sequences in a normal string literal.
        title = re.sub(r'[\|\*\[\]\(\)~\\]', '', title)
        return title
Exemplo n.º 3
0
def get_video_links_from_html(text):
    """
    Collect the video links found in a chunk of HTML.

    Every ``href="..."`` value in *text* is examined in order.  A value is
    kept only when ``get_host_code`` gives a truthy code for it, the
    ``link_cleaners`` entry for that code is truthy, and that cleaner
    returns a truthy result for the value passed through
    ``fix_html_entities``.  The cleaned results are returned as a list.
    """
    # A plain regex is enough here; BeautifulSoup would also do the job.
    found = []
    for href in re.findall('href="(.*?)"', text):
        host_code = get_host_code(href)
        if not host_code:
            continue
        cleaner = link_cleaners[host_code]
        if not cleaner:
            continue
        cleaned = cleaner(fix_html_entities(href))
        if cleaned:
            found.append(cleaned)
    return found
Exemplo n.º 4
0
def get_video_links_from_html(text):
    """
    Return the cleaned video links referenced by ``href`` attributes.

    Each ``href="..."`` value in *text* is looked up with ``get_host_code``;
    when the code and its ``link_cleaners`` entry are both truthy, the value
    is run through ``fix_html_entities`` and then the cleaner.  Only truthy
    cleaner results are included in the returned list, in document order.
    """
    # Regex extraction is sufficient; an HTML parser (e.g. BeautifulSoup)
    # would be an alternative.
    href_re = re.compile('href="(.*?)"')
    results = []
    for match in href_re.finditer(text):
        raw = match.group(1)
        code = get_host_code(raw)
        cleaner = link_cleaners[code] if code else None
        link = cleaner(fix_html_entities(raw)) if cleaner else None
        if link:
            results.append(link)
    return results