Example #1
def markdownify(url_list, **options):
    articles = []
    images = []
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']
    for url in url_list:
        req = urllib2.Request(url,None,{'Referer': url_list[0]})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary = document.summary()
        summary_doc = build_doc(summary)
        images.extend([a.get('src') for a in summary_doc.findall('.//img')])
        articles.append(document.summary())

    markdown_articles = []
    for (article, url) in zip(articles, url_list):
        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = (paragraph_links and 1) or 0
        h.body_width = (wrap_text and 78) or 0
        markdown_articles.append(h.handle(article))
    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        combined_article = (u"Title:        %s  \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined_article
    return combined_article.encode("utf-8")
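A minimal call sketch for the function above (not part of the original example; the URL is a placeholder, and the keyword arguments mirror the option keys the function reads out of **options):

# hypothetical usage of markdownify() defined above
urls = ['http://example.com/some-article']  # placeholder URL
md = markdownify(urls, paragraph_links=True, wrap_text=True, preamble=True)
print md  # UTF-8 encoded Markdown, with a title/URL preamble and "----" separators between articles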
Example #2
  def __init__(self, raw, ident):
    self.raw = raw 
    self.ident = ident 

    self.featval = {}

    self.html = build_doc(raw)
    self.doc = Document(raw)
    self.content = content(self.doc.summary())
    self.sents = sent_tokenize(self.content)
Example #3
    def __init__(self, html_string):
        # use readability `build_doc` func to avoid encoding error
        self.html, _ = htmls.build_doc(html_string)
        self.title = self.get_title(self.html)
        # Use self.html when self.html.body does not exist
        try:
            self.body = cleaner.clean_html(self.html.body)
        except Exception as e:
            self.body = cleaner.clean_html(self.html)
            logger.warn(repr(e))

        self.prepend_newline()
Example #4
def get_review_page(url, page):
    review_url = url + '/review_more?pageno=%s'%page
    html_source, _= curl(review_url)
    if html_source:
        html_source = re.sub(r'<br\s*/?>', ' ', html_source)  # normalize <br>, <br/>, <br /> variants to spaces
        unicode_source = html_source.decode('utf-8', 'ignore')
        doc = build_doc(unicode_source)
        comment_list = doc.xpath("//div[@class='J_brief-cont']/text()")
        time_list = doc.xpath("//span[@class='time']/text()")
        rank_list = map(lambda x:rank_dict[x], doc.xpath("//div[@class='user-info']/span[1]/@title"))
        user_list = doc.xpath("//p[@class='name']/a[@class='J_card']/text()")
        return zip(user_list, rank_list, time_list, comment_list)
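A hedged usage sketch, assuming the curl helper and the rank_dict mapping referenced above are importable from the same module; the shop URL is a placeholder:

# hypothetical call to get_review_page() defined above
rows = get_review_page('http://example.com/shop/12345678', 1)  # placeholder URL
for user, rank, time_str, comment in rows or []:  # rows is None when the fetch fails
    print user, rank, time_str, comment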
Example #5
def get_reviews(xlsfile, url):
    start = datetime(2000, 7, 1)
    end = datetime(2013, 8, 16)
    review_url = url + '/review_more?pageno=1'
    html_source, _= curl(review_url)
    if html_source:
        unicode_source = html_source.decode('utf-8', 'ignore')
        doc = build_doc(unicode_source)
        name = doc.xpath("//div[@class='info-name']/h2/a/text()")
        account = doc.xpath("//span[@class='active']/em[1]/text()")
        account = re.search('\d+', str(account))
        account = int(account.group(0)) if account else 0
        pages = how_many_pages(account, 20)
        xlsfile.append(name)
        for page in range(1, pages+1):
            too_old = False
            lines = get_review_page(url, page)
            for line in lines:
                time = re.findall('\d+', line[2])
                length = len(time)
                if length == 2 or length > 3:
                    month = int(time[0])
                    day = int(time[1])
                    year = 2013
                elif length == 3:
                    year = int('20' + time[0])
                    month = int(time[1])
                    day = int(time[2])
                else:
                    # guard against unparseable dates (the original code would hit a NameError below)
                    continue

                comment_date = datetime(year, month, day)
                if comment_date<end:
                    if comment_date<start:
                        too_old = True
                        break

                    line = list(line)
                    line[2] = comment_date.date()
                    line.insert(0, name[0])
                    print line
                    print time
                    print year, month, day

                    xlsfile.append(line)

            sleep(2)

            if too_old:
                break

    xlsfile.append([' '])
    xlsfile.append([' '])
Example #6
 def _parse(self, input):
     doc, self.encoding = build_doc(input)
     doc = html_cleaner.clean_html(doc)
     base_href = self.url
     if base_href:
         # trying to guard against bad links like <a href="http://[http://...">
         try:
             # such support is added in lxml 3.3.0
             doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures='discard')
         except TypeError: #make_links_absolute() got an unexpected keyword argument 'handle_failures'
             # then we have lxml < 3.3.0
             # please upgrade to lxml >= 3.3.0 if you're failing here!
             doc.make_links_absolute(base_href, resolve_base_href=True)
     else:
         doc.resolve_base_href()
     return doc
Example #7
def split_page(domain, html_source, encode="utf-8"):
    doc = build_doc(html_source)
    if domain == 'mtime.com':
        sub_page_nodes = doc.xpath(".//div[@class='t_module']")
    elif domain == 'douban.com':
        sub_page_nodes = doc.xpath(".//div[@class='comment-item']")

    for sub_page_node in sub_page_nodes:
        html = u"""
            <html>
                <body>
                %s
                </body>
            </html>
        """ % tostring(sub_page_node).decode('utf-8')
        yield html.encode(encode)
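A small consumption sketch for the generator above, assuming html_source already holds a page fetched from one of the two supported domains:

# hypothetical usage of split_page() defined above
for page_html in split_page('douban.com', html_source):
    sub_doc = build_doc(page_html)
    # ... run per-comment XPath queries against sub_doc here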
Example #8
def main():
    with open("stat.log", 'w') as f:
        for line in sys.stdin:
            url = line.rstrip('\n')
            html_source, _ = curl(url)  
            if not html_source:
                print "ERROR:", url
                continue
            unicode_source = html_source.decode('gbk')
            if u'你访问的贴子不存在' in unicode_source or u'你访问的贴子被隐藏' in unicode_source:
                print "ERROR:", url
                continue
            else:
                print url

            doc = build_doc(html_source)
            tie_ba = doc.xpath(XPATH)
            f.write(tie_ba[0] + '\n')
Example #9
def predefined_site(url, html):
    print(url, "=====================predefined_site?")
    pds = predefine_sites_collection.find()
    for pd in pds:
        if re.compile(pd['url_pattern'], re.I).search(url) is not None:
            print(url, "=====================predefined_site")
            doc = htmls.build_doc(html)
            # a single pass over the element tree is enough here
            for elem in doc.iter():
                s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
                if re.compile(pd['content_css']).search(s) is not None:
                    print(url, "=====================predefined_site",
                          htmls.get_keywords(doc) + ',' + ','.join(pd['tags']))
                    return Summary(get_clean_html(elem),
                                   '',
                                   short_title=htmls.shorten_title(doc),
                                   title=htmls.get_title(doc),
                                   description=htmls.get_description(doc),
                                   keywords=htmls.get_keywords(doc) + ',' + ','.join(pd['tags']))
    return None
Example #10
def get_comments(url):
    xpaths = {
            '21cn.com': [("//td[@class='t_f']/text()",), ("//div[@class='pg']//a/@href", )],
            '55bbs.com': [("//div[@class='t_msgfont']/text()", ), ("//div[@class='pages']//a/@href", )],
            #'lirenn.55bbs.com': [("//dd[@class='clearfix']//p/text()", ), ],
            'lady8844.com': [("//td[@class='t_msgfont']/text()",), ("//div[@class='pages']//a/@href", )],
            'onlylady.com': [("//td[@class='t_f']/text()",), ("//div[@class='pg']//a/@href", )],
            'pclady.com.cn': [("//div[@class='replyBody']/text()", ), ("//div[@class='pager']//a/@href", )],
            'yoka.com': [("//td[@class='con_content']/text()", ), ("//dl[@class='bbs_Page']/dt//a/@href", )], # ajax 
        }

    domain = get_domain(url)
    parse_result = urlparse.urlparse(url)
    page_urls = []
    prefix = parse_result.scheme + '://' + parse_result.netloc
    if domain not in xpaths:
        return [], []
    if (domain == '55bbs.com' and 'liren' in url) or url in crawed_urls:
        return [], page_urls
    xpath = xpaths[domain]
    html_source, _ = curl(url)
    if not html_source:
        return [], []
    print domain, url
    crawed_urls.add(url)
    doc = build_doc(html_source)
    comments = doc.xpath(xpath[0][0])
    pages = doc.xpath(xpath[1][0])
    for page in pages:
        if page.startswith('/'):
            page = prefix + page
        if not page.startswith('http'):
            page = prefix + '/' + page

        if page not in crawed_urls:
            page_urls.append(page)

    return comments, page_urls
Example #11
def get_comments(source_url):
    queue = deque() 
    comments = []
    domain = get_domain(source_url)
    crawled_url = set()
    if domain == 'mtime.com':
        url = source_url + '/comment.html'
    elif domain == 'douban.com':
        url = source_url + "/reviews"
    queue.append(url)
    comment_urls = set() 
    while len(queue):
        url = queue.popleft()
        if url in crawled_url:
            continue
        print url
        html_source, _ = curl(url)
        if not html_source:
            time.sleep(2*60)
            continue
        time.sleep(1.2)
        crawled_url.add(url)
        doc = build_doc(html_source)
        pages = doc.xpath(xpath[domain][0][1][0]) 
        pages = build_url(url, pages)
        queue.extend(pages)
        comment_urls |= set(doc.xpath(xpath[domain][0][0][0]))
    for comment_url in comment_urls:
        print comment_url
        comment_source, _ = curl(comment_url)
        if not comment_source:
            time.sleep(2*60)
            continue
        time.sleep(1.2)
        doc = build_doc(comment_source)
        title = u' '.join(doc.xpath(xpath[domain][0][2][0]))
        author = u" ".join(doc.xpath(xpath[domain][0][2][1]))
        created_on = u' '.join(doc.xpath(xpath[domain][0][2][2]))
        content = u' '.join(doc.xpath(xpath[domain][0][2][3]))
        #comments.append((title, author, created_on, content))
        yield (title, author, created_on, content)
    if domain == 'mtime.com':
        url = source_url + '/newshortcomment.html'
    elif domain == 'douban.com':
        url = source_url + "/comments"
    queue.append(url)
    while len(queue):
        url = queue.popleft()
        if url in crawled_url:
            continue
        html_source, _ = curl(url)
        if not html_source:
            time.sleep(2*60)
            queue.appendleft(url)
            continue
        print url
        time.sleep(1.2)
        crawled_url.add(url)
        doc = build_doc(html_source)
        pages = doc.xpath(xpath[domain][0][1][0]) 
        pages = build_url(url, pages)
        queue.extend(pages)
        for page_source in split_page(domain, html_source):
            doc = build_doc(page_source)
            author = u' '.join(doc.xpath(xpath[domain][1][2][0]))
            created_on = u' '.join(doc.xpath(xpath[domain][1][2][1]))
            content = u' '.join(doc.xpath(xpath[domain][1][2][2]))
            #comments.append((u"", author, created_on, content))
            yield (u"", author, created_on, content)
Example #12
def main():
    start = datetime(2013, 6, 16)
    end = datetime(2013, 6, 17)
    f = open('stat-0616.csv', 'w')  # newly generated file, eventually sent to the consultant
    f2 = codecs.open(u'中国最强音.csv', 'r', 'gbk')  # file produced by query_from_pg.py
    writer = csv.writer(f)
    parsed_set =  {}
    for line in f2:
        row = line.split(',')
        print row[1], row[0]
        if 'tieba' not in row[0]:
            continue
        topic_url = get_url_with_qs(row[0])
        if topic_url not in parsed_set:
            html_source, _= curl(topic_url)
            if html_source:
                unicode_source = html_source.decode('gbk', 'ignore')
                if u'你访问的贴子不存在' in unicode_source or u'你访问的贴子被隐藏' in unicode_source:
                    print "ERROR:", row[0] 
                    continue
                doc = build_doc(unicode_source)
                title = doc.xpath("//h1//text()") 
                if "#" in row[0]:
                    title = u"[回复]" + title[0] if len(title)>0 else ''
                else:
                    title = title[0] if len(title)>0 else ''

                comment_num = doc.xpath("//span[@id='comment_num']/text()|(//li[@class='l_reply_num'])[3]/span/text()")
                comment_count=comment_num[0] if len(comment_num)>0 else 0

                at_xpath = "//div[contains(@class,'l_post')]//@data-field"
                regex = re.compile(u'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2}).*?(?P<hour>\d{1,2}).*?(?P<minute>\d{1,2})')
                at = doc.xpath(at_xpath)
                matched = regex.search(str(at))
                kwargs = {}
                if matched:
                    for key, value in matched.groupdict().items():
                        if value:
                            kwargs[key] = int(value)
                    at_str = datetime(**kwargs).strftime("%Y-%m-%d %H:%M:%S") 
                else:
                    at_str=''

                line = [topic_url, title[4:], at_str, comment_count]
                line = [column.encode('gbk') if isinstance(column, unicode) else column for column in line]
                #line=map(lambda x: re.sub(u'\s+|\xa0', ' ', x.decode('utf-8')).encode('gbk', 'ignore') if isinstance(x, basestring) else x, line)
                parsed_set[topic_url] = {'title': title, 'comment_count': comment_count} 
                
                at_time=datetime.strptime(at_str, "%Y-%m-%d %H:%M:%S")
                if start<= at_time < end:
                    writer.writerow(line)
            time.sleep(3) 

        if topic_url in parsed_set:
            title = parsed_set[topic_url]['title']
            comment_count = parsed_set[topic_url]['comment_count']
            line = [row[0], title, row[1], comment_count]
            line = [column.encode('utf-8') if isinstance(column, unicode) else column for column in line]

            at_time=datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
            if start<= at_time < end:
                writer.writerow(line)

    f.close()
    f2.close()
Example #13
def content(html):
  doc = build_doc(html)
  text = [ clean_quotes(s.strip()) for s in doc.xpath('//text()') ]
  text = [ t.replace('\n',' ') for t in text if t ]
  text = ' '.join(text)
  return text
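A minimal sketch of calling content(), assuming clean_quotes from the same module is available; the HTML snippet is illustrative:

# hypothetical usage of content() defined above
snippet = u"<html><body><p>first paragraph</p><p>second paragraph</p></body></html>"
print content(snippet)  # joins all text nodes into a single line of plain text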