Пример #1
0
def execute_from_command_line(yahoo_dir='J:\\yahoo_data\\'):
    """Concatenate the converted text of numbered HTML files into ``all.txt``.

    Reads ``1.html``, ``2.html``, ... from *yahoo_dir* in ascending order and
    stops at the first number for which no file exists. The content of each
    file is passed through ``html2txt`` (defined elsewhere) and the result is
    written to ``all.txt`` under the same prefix. The output file is opened
    in ``'w'`` mode, so any previous content is discarded. No encoding
    detection or conversion is performed here.

    Args:
        yahoo_dir: Path prefix of the directory holding the numbered
            ``.html`` files. File names are built by plain string
            concatenation, so the prefix must end with a path separator.
            Defaults to the location that used to be hard-coded.
    """
    # ``with`` closes both files even when reading or html2txt raises.
    with open(yahoo_dir + 'all.txt', 'w') as yahoo_txt_file:
        start = 1
        while True:
            filename = yahoo_dir + str(start) + '.html'
            if not os.path.isfile(filename):
                # The first gap in the numbering ends the run.
                break

            with open(filename, 'r') as fp:
                htmltxt = fp.read()

            yahoo_txt_file.write(html2txt(htmltxt))
            # Single-argument print(...) prints the same text on Python 2.
            print('Success change html to txt')

            start += 1
Пример #2
0
def html_to_txt():
    """Merge numbered HTML files into one text file, normalised to UTF-8/ASCII.

    Reads ``1.html``, ``2.html``, ... from the module-level ``yahoo_dir``
    prefix in ascending order and stops at the first number for which no
    file exists. For every non-empty file the encoding is guessed with
    ``chardet.detect``; content that is not reported as ``'utf-8'`` or
    ``'ascii'`` is re-encoded to UTF-8. The content is then passed through
    ``html2txt`` (defined elsewhere) and written to the file named by the
    module-level ``yahoo_txt``, which is opened in ``'w'`` mode.

    Empty input files are skipped. Content whose encoding cannot be
    detected is passed through without re-encoding.
    """
    # ``with`` closes the output even if detection or conversion raises.
    with open(yahoo_txt, 'w') as ft:
        start = 1
        while True:
            filename = yahoo_dir + str(start) + '.html'
            if not os.path.isfile(filename):
                # The first gap in the numbering ends the run.
                break

            with open(filename, 'r') as fp:
                htmltxt = fp.read()

            if not htmltxt:
                # Advance past the empty file; without this increment the
                # loop would re-open the same file forever.
                start += 1
                continue

            # Encoding as detected before any conversion.
            codedetect = chardet.detect(htmltxt)["encoding"]
            print(codedetect)
            # A None encoding (nothing detected) cannot be handed to
            # unicode(), so such content is left untouched.
            if codedetect is not None and codedetect not in ('utf-8', 'ascii'):
                htmltxt = unicode(htmltxt, codedetect).encode('utf-8')
                # Encoding as detected after the conversion.
                codedetect = chardet.detect(htmltxt)["encoding"]
                print('change %s' % codedetect)

            ft.write(html2txt(htmltxt))

            print('Success change html to txt %s' % start)

            start += 1
Пример #3
0
def html_to_txt():
    """Combine numbered HTML files into a single UTF-8/ASCII text file.

    Walks ``<yahoo_dir>1.html``, ``<yahoo_dir>2.html``, ... (``yahoo_dir`` is
    a module-level path prefix) until a number has no matching file. Each
    non-empty file has its encoding guessed by ``chardet.detect``; anything
    not reported as ``'utf-8'`` or ``'ascii'`` is converted to UTF-8. The
    content then goes through ``html2txt`` (defined elsewhere) and is written
    to the module-level ``yahoo_txt`` path, which is truncated on open.

    Empty files are skipped, and content with no detectable encoding is
    written through without conversion.
    """
    # Context managers ensure every handle is released on any exit path.
    with open(yahoo_txt, 'w') as ft:
        start = 1
        while True:
            filename = yahoo_dir + str(start) + '.html'
            if not os.path.isfile(filename):
                # Numbering is expected to be contiguous; stop at the gap.
                break

            with open(filename, 'r') as fp:
                htmltxt = fp.read()

            if htmltxt:
                # Encoding as detected before any conversion.
                codedetect = chardet.detect(htmltxt)["encoding"]
                print(codedetect)
                # Skip re-encoding when nothing was detected (None), since
                # unicode() rejects a None encoding.
                needs_conversion = (
                    codedetect is not None
                    and codedetect not in ('utf-8', 'ascii')
                )
                if needs_conversion:
                    htmltxt = unicode(htmltxt, codedetect).encode('utf-8')
                    # Encoding as detected after the conversion.
                    codedetect = chardet.detect(htmltxt)["encoding"]
                    print('change %s' % codedetect)

                ft.write(html2txt(htmltxt))

                print('Success change html to txt %s' % start)

            # Always advance, including for empty files, so an empty file
            # cannot stall the loop on the same index.
            start += 1
Пример #4
0
def execute_from_command_line(yahoo_dir='J:\\yahoo_data\\'):
    """Convert numbered HTML files to text and collect them in ``all.txt``.

    Processes ``1.html``, ``2.html``, ... found under *yahoo_dir*, in order,
    until a number has no matching file. Each file's content is handed to
    ``html2txt`` (defined elsewhere) and the returned text is written to
    ``all.txt`` under the same prefix. The output is opened in ``'w'`` mode
    and therefore starts empty on every run. The content is not re-encoded
    by this function.

    Args:
        yahoo_dir: Path prefix for both the numbered input files and the
            ``all.txt`` output. It is concatenated directly with the file
            names, so it must end with a path separator. Defaults to the
            previously hard-coded directory.
    """
    # Context managers release the handles even if html2txt raises.
    with open(yahoo_dir + 'all.txt', 'w') as yahoo_txt_file:
        start = 1
        while True:
            filename = yahoo_dir + str(start) + '.html'
            if not os.path.isfile(filename):
                # Stop at the first missing index.
                break

            with open(filename, 'r') as fp:
                htmltxt = fp.read()

            yahoo_txt_file.write(html2txt(htmltxt))
            # Single-argument print(...) prints the same text on Python 2.
            print('Success change html to txt')

            start += 1