Exemplo n.º 1
0
Arquivo: wenku8.py Projeto: zqpm/ACG
 def download(self, unit=[]):
     if len(unit) == 0:
         unit = self.chps
     else:
         nunit = list()
         for idd in unit:
             nunit.append(self.chps[idd])
     unit = nunit
     for idx in unit:
         fname = idx[0] + '_' + self.fec + '.txt'
         if os.path.exists(fname):
             continue
         f = open(fname,'w')
         chplink = idx[1]
         print 'Downloading', idx[2], '>>> ', fname
         self.response = urllib2.urlopen(chplink)
         hj = self.rm_spchar(self.response.read())
         chpsp = bs(hj)
         chpsp = self.rm_spam(chpsp)
         # format output by wehku
         # get title/content
         chttl = chpsp.findAll('div',{'class':'chaptertitle'})
         chcnt = chpsp.findAll('div',{'class':'chaptercontent'})
         for vol in range(len(chttl) -1):
             atl = jtof(chttl[vol].text).encode(self.fec,'ignore').lstrip()
             f.write(atl)
             f.write('\n==========\n')
             f.write(jtof(chcnt[vol].text).encode(self.fec,'ignore'))
             f.write('\n\n\n\n')
         chpsp.close()
         f.close()
     return
Exemplo n.º 2
0
def cvt_codec(node):
    """Return node.text converted via jtof, UTF-8 encoding it first when possible."""
    text = node.text
    if hasattr(text, 'encode'):
        # Unicode text: hand jtof the UTF-8 byte form.
        val = jtof(text.encode('utf-8'))
    else:
        # No encode() available; convert the value as-is.
        val = jtof(text)
    return val
Exemplo n.º 3
0
def fetch_data(word_lst):
    """Look up each word of *word_lst* on the online dictionary and print/emit
    its Chinese meaning and example sentences via output_result().

    Relies on module globals _REQ_HTTP_PREFIX and _REQ_TIMEOUTS and on the
    helpers found_result(), output_result(), jtof().
    """

    for word in word_lst:

        if len(word.split()) > 4 : # skip multi-word phrases (likely a full sentence)
            continue

        # The site expects underscores between words of a phrase.
        req_word =str("_".join(word.split()))
        req_url =_REQ_HTTP_PREFIX+req_word
        print(req_url)

        results = requests.get(req_url,
              headers={'User-Agent': 'Mozilla/5.0'}, timeout=_REQ_TIMEOUTS)

        soup = BeautifulSoup(results.text)

        # Nothing found for this word -- move on.
        if not found_result(soup):
            continue

        look_up_word =soup.find('h1', {'id' :'word_name_h1'}).getText()
        output_result("\n[%s] " % look_up_word , False)

        soup_translate =soup.find('div', {'class' :'group_pos'})


        # Chinese meaning (converted to Traditional via jtof)
        for lbl in soup_translate.findChildren('label'):
            output_result( jtof( lbl.getText().strip()) , False)
        output_result('')
        soup_sent = soup.findAll('dl',{'class':'vDef_list'})
        try:
            # The highlighted occurrence of the word inside the first example.
            got_word = soup.find('dl',{'class':'vDef_list'}).find('span',{'class' :'text_blue'}).getText()
        except Exception as e:
            continue
        # example sentence
        for each_sent in soup_sent:
            # res_word="".join(word.split())
            # Re-space the matched word so it stands out in the sentence.
            replace_word=" ".join(got_word.split())
            sent = each_sent.find('dt').getText().replace(got_word," "+replace_word+" ")

            # NOTE(review): the two prints below look like leftover debug
            # output (firstText() may not even exist on this soup flavour).
            print each_sent.find('dt').firstText()
            print each_sent.find('dt').getText()

            output_result('\t'+sent)
            output_result('\t'+jtof(each_sent.find('dd').getText()))

    pass
Exemplo n.º 4
0
	def _jtof(self, text=''):
		"""Convert *text* from Simplified to Traditional Chinese, also mapping
		curly double quotes to CJK corner brackets.  Best effort: on any
		failure the value converted so far (or the input) is returned."""
		try:
			text = jianfan.jtof(text)
			for src, dst in ((u'\u201c', u'\u300c'), (u'\u201d', u'\u300d')):
				text = text.replace(src, dst)
		except Exception:
			pass
		return text
Exemplo n.º 5
0
    def __search__(self,print_results=False):
        '''
        Query the Google AJAX search API once per configured page and return
        the list of raw page-response dicts (pages with a bad/missing
        responseStatus are skipped, not returned as False).
        When print_results is True, each hit is printed with its title
        converted to Traditional Chinese; titles that needed conversion are
        tagged as coming from a Simplified-Chinese site.
        '''
        results = []
        for page in range(0,self.pages):
            # Page stride: small result pages carry 4 hits, normal ones 8.
            rsz = 8
            if self.rsz == RSZ_SMALL:
                rsz = 4
            args = {'q' : self.query,
                    'v' : '1.0',
                    'start' : page*rsz,
                    'rsz': self.rsz,
                    'safe' : self.safe, 
                    'filter' : self.filter,    
                    'hl'    : self.hl
                    }
            self.logger.debug('search: "%s" page# : %s'%(self.query, page))
            q = urllib.urlencode(args)
            search_results = urllib.urlopen(URL+q)
            data = json.loads(search_results.read())
            if not data.has_key('responseStatus'):
                self.logger.error('response does not have a responseStatus key')
                continue
            if data.get('responseStatus') != 200:
                self.logger.debug('responseStatus is not 200')
                self.logger.error('responseDetails : %s'%(data.get('responseDetails', None)))
                continue
            if print_results:
                if data.has_key('responseData') and data['responseData'].has_key('results'):
                    for result in  data['responseData']['results']:
                        if result:
							# NOTE(review): this jtof variant returns
							# (converted_text, changed_flag) -- confirm.
							newTitle,bChanged = jtof('[%s]'%(urllib.unquote(result['titleNoFormatting'])))
							jianWarningStr=''
							if bChanged :
								jianWarningStr = u'\u7c21\u9ad4\u7db2\u7ad9'
							print clrTx(newTitle,'YELLOW')+clrTx(jianWarningStr,'GREY30')
							print jtof(result['content'].strip("<b>...</b>").replace("<b>",'').replace("</b>",'').replace("&#39;","'").strip())[0]
							print clrTx(urllib.unquote(result['unescapedUrl'])+'\n','GREY30')
                else:
                    # no responseData key was found in 'data' 
                    self.logger.error('no responseData key found in response. very unusal')
            results.append(data)
        return results
def main():
	"""Walk tPath, converting every collected path name from Simplified to
	Traditional Chinese and renaming the entry when the name changed."""
	pending = scanAndProduceStack(unicode(tPath))
	# Drain the stack of collected paths one by one.
	while len(pending) != 0:
		old_path = pending.pop()
		new_path = jtof(old_path)
		#print u"o:(%s) n:(%s)" %(old_path,new_path)
		if new_path != old_path:
			proc = Popen(prepareRenamePara(old_path, new_path))
			proc.wait()
Exemplo n.º 7
0
def convert_file(target_file):
    """Convert *target_file* in place from its detected encoding to UTF-8,
    optionally converting Simplified to Traditional Chinese.

    Behaviour is driven by module-level configuration: ``backup``,
    ``convert_type``, ``use_bom``, ``use_user_dic``, ``user_dic_file``,
    ``dic_tw`` and the ``MSG_*`` constants.
    Fixes: file handles are now closed via ``with`` even on error, and the
    unused ``result_content`` local is gone.
    """
    if not os.path.isfile(target_file):
        print("File not found! " + target_file + " 檔案不存在! ")
        return
    user_dic = {}
    f_encoding = get_encoding(target_file)
    print("正在轉換", target_file, " 編碼為: ", f_encoding)
    if f_encoding is None:
        print("抱歉, 未能正確判斷編碼!\n\n")
        return
    if os.path.getsize(target_file) <= 0:
        print(MSG_NO_CONVERT)
        return
    if backup:
        # Keep a pristine copy next to the file before overwriting it.
        shutil.copy2(target_file, target_file + '.bak')
    # Read the whole file in its detected encoding.
    with open(target_file, 'r', encoding=f_encoding) as fp:
        original_content = fp.read()

    # An optional per-directory user dictionary refines the vocabulary pass.
    path_dir = os.path.dirname(os.path.abspath(target_file))
    user_dic_pathname = path_dir + os.path.sep + user_dic_file
    if os.path.exists(user_dic_pathname):
        user_dic = get_dictionary(user_dic_pathname)

    if convert_type == "none" or convert_type == "utf8":
        new_content = original_content
    else:
        # Simplified -> Traditional character conversion.
        new_content = jtof(original_content)

    # Rewrite as UTF-8, with or without a BOM.
    out_encoding = 'utf-8-sig' if use_bom else 'utf-8'
    with open(target_file, 'w', encoding=out_encoding) as fpw:
        for line in new_content.splitlines(True):
            if convert_type == "g2bdic":
                # Vocabulary-level substitution (mainland -> Taiwan terms).
                newline = convert_vocabulary(line, dic_tw)
                if use_user_dic:
                    newline = convert_vocabulary(newline, user_dic)
                fpw.write(newline)
            else:
                fpw.write(line)

    print(MSG_CONVERT_FINISH)
Exemplo n.º 8
0
def convert_file(target_file):
    """Convert *target_file* in place from its detected encoding to UTF-8,
    optionally converting Simplified to Traditional Chinese.

    Behaviour is driven by module-level configuration: ``backup``,
    ``convert_type``, ``use_bom``, ``use_user_dic``, ``user_dic_file``,
    ``dic_tw`` and the ``MSG_*`` constants.
    Fixes: file handles are now closed via ``with`` even on error, and the
    unused ``result_content`` local is gone.
    """
    if not os.path.isfile(target_file):
        print("File not found! " + target_file + " 檔案不存在! ")
        return
    user_dic = {}
    f_encoding = get_encoding(target_file)
    print("正在轉換", target_file, " 編碼為: ", f_encoding)
    if f_encoding is None:
        print("抱歉, 未能正確判斷編碼!\n\n")
        return
    if os.path.getsize(target_file) <= 0:
        print(MSG_NO_CONVERT)
        return
    if backup:
        # Keep a pristine copy next to the file before overwriting it.
        shutil.copy2(target_file, target_file + '.bak')
    # Read the whole file in its detected encoding.
    with open(target_file, 'r', encoding=f_encoding) as fp:
        original_content = fp.read()

    # An optional per-directory user dictionary refines the vocabulary pass.
    path_dir = os.path.dirname(os.path.abspath(target_file))
    user_dic_pathname = path_dir + os.path.sep + user_dic_file
    if os.path.exists(user_dic_pathname):
        user_dic = get_dictionary(user_dic_pathname)

    if convert_type == "none" or convert_type == "utf8":
        new_content = original_content
    else:
        # Simplified -> Traditional character conversion.
        new_content = jtof(original_content)

    # Rewrite as UTF-8, with or without a BOM.
    out_encoding = 'utf-8-sig' if use_bom else 'utf-8'
    with open(target_file, 'w', encoding=out_encoding) as fpw:
        for line in new_content.splitlines(True):
            if convert_type == "g2bdic":
                # Vocabulary-level substitution (mainland -> Taiwan terms).
                newline = convert_vocabulary(line, dic_tw)
                if use_user_dic:
                    newline = convert_vocabulary(newline, user_dic)
                fpw.write(newline)
            else:
                fpw.write(line)

    print(MSG_CONVERT_FINISH)
Exemplo n.º 9
0
def renamer(target):
    """Recursively rename files under *target* whose extension is in the
    allowed list, converting the base name from Simplified to Traditional
    Chinese via jtof().

    Fix: the locals previously shadowed the builtins ``filter`` and ``list``.
    Note: the function chdir()s into each directory; the caller's cwd is not
    restored at the top level (unchanged original behaviour).
    """
    wanted_exts = ['mp3']  # 'pdf', 'zip', 'chm', 'rar', 'djvu', 'epub'
    os.chdir(target)
    for entry in os.listdir('.'):
        if os.path.isdir(entry):
            backto = os.getcwd()
            renamer(entry)
            os.chdir(backto)
        else:
            parts = entry.rsplit(".", 1)
            if len(parts) > 1 and parts[-1] in wanted_exts:
                newfilename = jtof(parts[0]) + "." + parts[-1]
                os.rename(entry, newfilename)
Exemplo n.º 10
0
def convertFile(target_file):
    user_dic = {}
    if os.path.exists(target_file):
        f_encoding = getEncoding(target_file)
        print u"正在轉換", target_file, u" 編碼為: ", f_encoding
        if f_encoding == None:
            print(u"抱歉, 未能正確判斷編碼!\n\n")
        else:
            result_content = u''
            original_content = u''
            fp = open(target_file, 'r')
            original_content = fp.read()
            fp.close()

            if original_content.startswith(codecs.BOM_UTF8):
                original_content = original_content.lstrip(codecs.BOM_UTF8)

            utf8content = original_content.decode(f_encoding)

            newcontent = jtof(utf8content)
            lines = newcontent.splitlines()
            for line in lines:
                line = convertVocabulary(line, dic_tw())

            if os.path.getsize(target_file) > 0:
                # do backup
                backup_file = target_file + '.bak'
                shutil.copy2(target_file, backup_file)
                fpw = open(target_file, 'w')
                if not newcontent.startswith(codecs.BOM_UTF8.decode("utf8")):
                    fpw.write(codecs.BOM_UTF8)

                pathdir = os.path.dirname(os.path.abspath(target_file))
                user_dic_pathname = pathdir + os.path.sep + user_dic_file

                if os.path.exists(user_dic_pathname):
                    user_dic = getUserDic(user_dic_pathname)
                    if len(user_dic) > 0:
                        for line in lines:
                            line = convertVocabulary(line, user_dic)

                for line in lines:
                    line = convertVocabulary(line, user_dic)
                    fpw.write(line.encode('UTF-8'))
                    fpw.write("\n")
                fpw.close()

                print(MSG_CONVERT_FINISH)
            else:
                print MSG_NO_CONVERT
Exemplo n.º 11
0
def convertFile(target_file):
	"""Convert target_file in place to Traditional Chinese UTF-8 (BOM kept):
	jtof() character conversion, then per-line vocabulary passes using
	dic_tw() and an optional user dictionary next to the file."""
	user_dic= {}
	if os.path.exists(target_file):
		f_encoding = getEncoding(target_file)
		print u"正在轉換", target_file, u" 編碼為: ", f_encoding
		if f_encoding == None:
			print (u"抱歉, 未能正確判斷編碼!\n\n");
		else:
			result_content = u''
			original_content = u''
			fp = open(target_file, 'r')
			original_content = fp.read()
			fp.close()
			
			# Drop a leading UTF-8 BOM before decoding.
			if original_content.startswith( codecs.BOM_UTF8 ):
				original_content = original_content.lstrip( codecs.BOM_UTF8);	
			
			utf8content=original_content.decode(f_encoding)

			newcontent = jtof(utf8content)
			lines = newcontent.splitlines();
			# NOTE(review): this loop only rebinds the loop variable --
			# the dic_tw() conversion results are discarded (dead code).
			for line in lines:
				line = convertVocabulary(line, dic_tw());
				
			if os.path.getsize(target_file) > 0:
				# do backup
				backup_file = target_file + '.bak'
				shutil.copy2(target_file, backup_file)
				fpw = open(target_file, 'w')
				if not newcontent.startswith(codecs.BOM_UTF8.decode( "utf8" )):
					fpw.write(codecs.BOM_UTF8)

				pathdir =os.path.dirname(os.path.abspath(target_file));
				user_dic_pathname = pathdir +os.path.sep+user_dic_file;
				
				if os.path.exists(user_dic_pathname):
					user_dic = getUserDic(user_dic_pathname);
					# NOTE(review): same dead-loop pattern -- results discarded.
					if len(user_dic) > 0:
						for line in lines:
							line = convertVocabulary(line,  user_dic);

				# Only this pass actually reaches the output file.
				for line in lines:
					line = convertVocabulary(line,  user_dic);
					fpw.write(line.encode('UTF-8'))
					fpw.write("\n");
				fpw.close();
				
				print (MSG_CONVERT_FINISH)
			else:
				print MSG_NO_CONVERT
Exemplo n.º 12
0
def convertFile(target_file):
    if os.path.isfile(target_file):
        f_encoding = getEncoding(target_file)
        print u"正在轉換", target_file, u" 編碼為: ", f_encoding
        if f_encoding is None:
            print(u"抱歉, 未能正確判斷編碼!\n\n")
        else:
            if os.path.getsize(target_file) > 0:
                if backup:
                    # do backup
                    backup_file = target_file + '.bak'
                    shutil.copy2(target_file, backup_file)

                result_content = u''
                original_content = u''
                fp = open(target_file, 'r')
                original_content = fp.read()
                fp.close()

                if original_content.startswith(codecs.BOM_UTF8):
                    original_content.lstrip(codecs.BOM_UTF8)

                utf8content = original_content.decode(f_encoding, 'ignore')
                if convertType != "none":
                    newcontent = jtof(utf8content)
                else:
                    newcontent = utf8content

                origlines = newcontent.splitlines(True)
                fpw = open(target_file, 'w')
                if (use_bom):
                    if not newcontent.startswith(
                            codecs.BOM_UTF8.decode("utf8")):
                        fpw.write(codecs.BOM_UTF8)
                for line in origlines:
                    if convertType == "g2bdic":
                        fpw.write(
                            convertVocabulary(line, dic_tw()).encode('UTF-8'))
                    else:
                        fpw.write(line.encode('UTF-8'))
                #fpw.write(newcontent.encode('UTF-8'))
                fpw.close()

                print(MSG_CONVERT_FINISH)
            else:
                print MSG_NO_CONVERT
    else:
        print "File not found! " + target_file + " 檔案不存在! "
Exemplo n.º 13
0
def convertFile(target_file):
    if os.path.isfile(target_file):
        f_encoding = getEncoding(target_file)
        print u"正在轉換", target_file, u" 編碼為: ", f_encoding
        if f_encoding is None:
            print (u"抱歉, 未能正確判斷編碼!\n\n");
        else:
            if os.path.getsize(target_file) > 0:
                if backup:
                    # do backup
                    filename, file_extension = os.path.splitext(target_file)
                    backup_file = filename + '.bak' + file_extension
                    shutil.copy2(target_file, backup_file)

                result_content = u''
                original_content = u''
                fp = open(target_file, 'r')
                original_content = fp.read()
                fp.close()

                if original_content.startswith(codecs.BOM_UTF8):
                    original_content.lstrip(codecs.BOM_UTF8)

                utf8content = original_content.decode(f_encoding, 'ignore')
                if convertType != "none":
                    newcontent = jtof(utf8content)
                else:
                    newcontent = utf8content

                origlines = newcontent.splitlines(True)
                fpw = open(target_file, 'w')
                if (use_bom):
                    if not newcontent.startswith(codecs.BOM_UTF8.decode("utf8")):
                        fpw.write(codecs.BOM_UTF8)
                for line in origlines:
                    if convertType == "g2bdic":
                        fpw.write(convertVocabulary(line, dic_tw()).encode('UTF-8'))
                    else:
                        fpw.write(line.encode('UTF-8'))
                # fpw.write(newcontent.encode('UTF-8'))
                fpw.close()

                print (MSG_CONVERT_FINISH)
            else:
                print MSG_NO_CONVERT
    else:
        print "File not found! " + target_file + " 檔案不存在! "
Exemplo n.º 14
0
def getWikiContent(queryStr):
	"""Fetch the Wikipedia page for queryStr (in module-level LANG) and return
	its plain-text content, converted to Traditional Chinese when LANG == 'zh'.
	Returns '' when the page is ambiguous or missing."""
	result = ''
	#DB.debug('debug print')
	#DB.error('error!')
	wikipedia.set_lang(LANG)
	try:
		q = wikipedia.page(queryStr)
		content = ''
		if LANG == 'zh' :
			# NOTE(review): this jtof variant appears to return a
			# (converted_text, changed_flag) tuple -- confirm.
			content, bChanged = jtof(q.content)
		else:
			content = q.content
		result = content
		DB.debug(result)
	except wikipedia.exceptions.DisambiguationError as e :		
		DB.error(e)
	except wikipedia.exceptions.PageError as e:	
		DB.error(e)
	return result
Exemplo n.º 15
0
def download_subtitles_in_dir(dir_name, timeout):
    """For every .mkv/.avi in dir_name that has no subtitle file yet, rename
    Simplified-Chinese file names to Traditional and fetch a subtitle via
    download_subtitle() (path passed Big5-encoded)."""
    try:
        file_list = os.listdir(dir_name.decode('utf-8'))
        for video_filename in file_list:
            # print "ext:", splitext(video_filename)[1]
            # print video_filename
            if splitext(video_filename)[1] in [".mkv",".avi"]:
                # Skip when any known subtitle extension already exists.
                if os.path.isfile(dir_name+splitext(video_filename)[0]+".ass") or os.path.isfile(dir_name+splitext(video_filename)[0]+".srt") \
                or os.path.isfile(dir_name+splitext(video_filename)[0]+".ssa") or os.path.isfile(dir_name+splitext(video_filename)[0]+".sub"):
                    print "subtitle exist already, skip"
                else:
                    fan_video_filename = jtof(video_filename)
                    
                    # video_filename is Simplified Chinese: rename the file to the Traditional form
                    if fan_video_filename != video_filename:
                        os.rename(dir_name+video_filename, dir_name+fan_video_filename)
                        download_subtitle((dir_name+fan_video_filename).encode('big5'), timeout)
                    else:
                        download_subtitle((dir_name+video_filename).encode('big5'), timeout) 
    except IOError:
        print "failed to open directory", dir_name, 
Exemplo n.º 16
0
def GetDBInfo(HtmlContent,WooyunNumber):
  """Parse a Wooyun report page and return a list of field values: the report
  number first (taken from the file name), then every <h3> "label: value"
  value converted to Traditional Chinese with vocabulary fix-ups."""
  DBInfoList = []
  ParserTarget = pq(HtmlContent)
  Content = ParserTarget('h3').map(lambda i,e: pq(e).text()) # grab the text of every <h3> tag

  for i,x in enumerate(Content):
    NoSpacestr =  x.encode('utf8').replace('\n', '').replace('\t', '') # strip embedded newlines/tabs
    TmpList = NoSpacestr.split(':') # element [1] is the wanted value
    if i == 0: # special-case the report id: the page drops a leading 0, so use the file-derived number instead
      DBInfoList.append(WooyunNumber)
      continue
    Big5Str = jtof(TmpList[1]) # Simplified -> Traditional Chinese
    # vocabulary fix-up, otherwise e.g. 注入 becomes 註入
    Big5Str = ConvertWords.Convert(Big5Str)
    #DbgPrint('Big5Str:%s' % Big5Str.encode('utf8') )
    DBInfoList.append(Big5Str) # store the Traditional form
  
  # (detection of "featured article" status is not implemented yet)
  
  

  return DBInfoList
Exemplo n.º 17
0
def search(original_query, age_check=None):
    """Generator: search Plurk for *original_query* (converted to Traditional
    Chinese) and yield matching tweets from Chinese-language users.

    age_check -- optional predicate on the plurk's posted time; when it
    returns true the whole search stops.
    BUG FIX: ``raise StopIteration`` inside a generator is forbidden by
    PEP 479 (RuntimeError on Python 3.7+); a bare ``return`` ends the
    generator identically on every version.
    """
    query = jianfan.jtof(original_query)
    logging.debug('Requesting Plurk for query <{0}>'.format(repr(query)))
    offset = 0
    uri = u'/APP/PlurkSearch/search?query={0}'.format(query)
    for page in range(25):
        try:
            response = get_client().callAPI(u'{0}&offset={1}'.format(uri,offset or ''))
        except ValueError as error:
            # Plurk signals "no results" via an API error.
            logging.info(u'Received Plurk error <{0}> suggesting no results'.format(error.message))
            break
        if not response: break
        users = response.get('users') or {}
        for plurk in response.get('plurks',[]):
            # Only keep plurks that literally contain the query text.
            if query.lower() not in plurk.get('content_raw').lower(): continue
            if age_check and age_check(plurk.get('posted')): return
            user_id = plurk.get('user_id')
            user = get_user_details(user_id) or users.get(user_id) or {}
            if user.get('default_lang') not in ('cn','tr_ch'): continue
            yield get_tweet(plurk, user)
        if not response.get('has_more'): break
        offset = response.get('last_offset')
Exemplo n.º 18
0
Arquivo: template.py Projeto: zqpm/ACG
    def download(self):
        if len(unit) == 0:
            unit = self.chps
        else:
            nunit = list()
            for idd in unit:
                nunit.append(self.chps[idd])
        unit = nunit
        for idx in unit:
            fname = idx[0] + '_' + self.fec + '.txt'

            if os.path.exists(fname):
                continue
            print 'Downloading', idx[2], '>>> ', fname
            f             = open(fname,'w')
            chplink       = self.SRC_URL + idx[1]
            self.response = urllib2.urlopen(chplink)
            processed     = self.dosomething(self.response.read())
            chpsp         = bs(processed)
            # format output by sfacg
            f.write(jtof(chpsp.body.text).encode(self.fec,"ignore"))
            f.close()
        return
Exemplo n.º 19
0
def segmentToListPerQuery(queryString):
    """Segment *queryString* and return the list of word tokens.

    Latin letters and ASCII punctuation are stripped, the text is converted
    to Simplified Chinese (the segmenter's dictionary form), and tokens whose
    POS flag is "x" (punctuation/whitespace) are dropped.
    Cleanup: the original's unused ``listPerQuery`` local and the discarded
    ``jianfan.jtof(z.word)`` dead store are removed; returned tokens are
    unchanged.
    """
    cleaned = re.sub("[a-zA-Z]+", "", queryString)
    cleaned = re.sub("[%s]" % re.escape(string.punctuation), "", cleaned)
    # Segmenter expects Simplified Chinese UTF-8 bytes.
    simplified = jianfan.ftoj(cleaned).encode("utf-8")
    segedList = []
    for token in pseg.cut(simplified):
        if token.flag != "x":
            segedList.append(token.word)
    return segedList
Exemplo n.º 20
0
def segmentToListPerQuery(queryString):
    """Segment *queryString* and return the list of word tokens.

    Latin letters and ASCII punctuation are stripped, the text is converted
    to Simplified Chinese (the segmenter's dictionary form), and tokens whose
    POS flag is "x" (punctuation/whitespace) are dropped.
    Cleanup: the original's unused ``listPerQuery`` local and the discarded
    ``jianfan.jtof(z.word)`` dead store are removed; returned tokens are
    unchanged.
    """
    cleaned = re.sub('[a-zA-Z]+', '', queryString)
    cleaned = re.sub('[%s]' % re.escape(string.punctuation), '', cleaned)
    # Segmenter expects Simplified Chinese UTF-8 bytes.
    simplified = jianfan.ftoj(cleaned).encode("utf-8")
    segedList = []
    for token in pseg.cut(simplified):
        if token.flag != "x":
            segedList.append(token.word)
    return segedList
Exemplo n.º 21
0
def translate_word(word):
    """Look *word* up on iciba's dictionary API and print its readings; fall
    back to the qq.com dictionary when iciba returns no useful entries.
    Returns -1 when the iciba response cannot be parsed at all."""


    dflt_params = conn2Site("dict-co.iciba.com","/api/dictionary.php?w=",word)


    # recover=True lets lxml tolerate the API's occasionally broken XML.
    root = etree.fromstring("%s" % dflt_params,
                            parser=etree.XMLParser(recover=True))
    if type(root) == type(None):
        return -1

    # Only one child means iciba had no real entry: fall back to dict.qq.com.
    if len(root.getchildren()) == 1 :

        dflt_params = conn2Site("dict.qq.com","/dict?q=",word)

        json_data = json.loads(dflt_params)

        if 'err' in json_data:
            return
        else:
            if 'netdes' in json_data:

                for ln in json_data['netdes']:

                    for sub_ln in ln['des']:

                        # NOTE(review): bare except silently swallows any
                        # formatting/encoding error per entry.
                        try:
                            if "mf" in sub_ln:
                                # print("mf")
                                print str(sub_ln['mf']+" "+jtof(sub_ln['d'])).replace('&quot;','')
                            else:
                                    print(sub_ln['d'])
                        except :
                            pass

            if 'netsen' in json_data:
                for ln in json_data['netsen']:
                    # NOTE(review): the ``continue`` makes everything below it
                    # in this loop dead code (example sentences disabled).
                    continue
                    print ln['cs']
                    print re.sub('<[^<]+?>', '', ln['es'])#escape html tag
                    # for k,v in ln.iteritems():
                    #     print k
                    # print unicode(ln,'utf-8')


        return

    # Normal iciba path: walk the XML children and print by tag kind.
    child = root.getchildren()
    for node in child:
        tag = node.tag
        val =  cvt_codec(node)

        if "key"==tag:
            print("")
            if len(val.split()) >=2 :
                stdout.write( "句子/片語:"+str(val)+"\r\n")
            elif len(val.split()) == 1 :
                stdout.write( "單字:"+str(val)+"\r\n")
            else :
                print("Error")

        if "pos"==tag:
            stdout.write( "(%s)"%val)
        if "acceptation"==tag:
            stdout.write(val)
        if "sent" == tag:
            got_sentence(node)
Exemplo n.º 22
0
# Scrape www.3d66.com's model page with a Selenium ``driver`` (created
# elsewhere) and collect [main_type, sub_type, link] triples with category
# names converted to Traditional Chinese.
mainpage = "http://www.3d66.com/model.html"

driver.get(mainpage)

# Main page: each class_para div holds one main category and its sub-links.
sub_types =[]
main_types = driver.find_elements_by_xpath("//div[@class='class_para']")
#info_file = "f:/3dbb_links.txt"
#file_object = open(info_file, "w")
for main_type in main_types:
    main_type_name = main_type.find_element_by_xpath("./p/a")
    sub_type_names = main_type.find_elements_by_xpath("./span/a")
    for sub_type_name in sub_type_names:
        sub_type_link = sub_type_name.get_attribute('href')
        # Convert names to Traditional and drop the boilerplate "3d模型" suffix.
        mt_name = jianfan.jtof(main_type_name.text).replace(u'3d模型','')
        st_name = jianfan.jtof(sub_type_name.text)
        sub_types.append([mt_name, st_name, sub_type_link])        
        #info_content = (mt_name + "," + st_name + "," + sub_type_link + "\n").encode('utf-8')
        #file_object.write(info_content)
#file_object.close()

saved = sub_types
'''
x = 0
for save in saved:
    x = x + 1
    print save[1], x
'''
# main type, sub type, link
Exemplo n.º 23
0
import os
import sys
sys.path.append('j2f')
import charsets
from jianfan import jtof

# Convert an SRT subtitle file from Simplified to Traditional Chinese.
# Fixes: file handles are now closed via ``with``; the redundant fc_2
# placeholder shuffle from the original is gone.  Output is unchanged.
with open('ESOcast72_s.srt', 'r') as src:
    simplified_text = src.read()

traditional_text = jtof(simplified_text)

with open('ESOcast72_t.srt', 'w') as dst:
    dst.write(traditional_text)

Exemplo n.º 24
0
        # (fragment -- the enclosing function's header is outside this view)
        name = name.string

        # Any CJK Unified Ideograph marks the name as Chinese-convertible.
        for character in name:
            if (u'\u4e00' <= character <= u'\u9fff'):
                find = 1
                break

        if find == 1:
            # But Japanese kana or Hangul means it is not actually Chinese.
            for character in name:
                if (u'\u3041' <= character <= u'\u309C'
                        or u'\uAC00' <= character <= u'\uCB4C'):
                    find = 0
                    break

            name = name.encode('utf8')
            # Simplified -> Traditional conversion of the game name.
            name_cht = jianfan.jtof(name)

        if find == 0:
            name_cht = 'No cht name'

    # Basic BoardGameGeek metadata pulled from the XML soup.
    player_min = soup.find('minplayers').string
    player_max = soup.find('maxplayers').string
    game_time = soup.find('maxplaytime').string
    publish_year = soup.find('yearpublished').string

    # The cover image id is the 5th path component of the image URL.
    if soup.find('image') != None:
        imgsrc = soup.find('image').string
        imgsrc = imgsrc.split('/')[4]
    else:
        imgsrc = 0
Exemplo n.º 25
0
def htmlParser(tPage):
   """Fetch a dictionary.goo.ne.jp search-result page, print its grouped
   results interactively (Chinese sections run through jianfan.jtof), and
   optionally spawn gooDetailA.py for a user-selected entry.
   Returns resultSet (currently always empty; the handler path is disabled)."""
   opener = urllib2.build_opener()
   opener.addheaders = [('User-agent','Mozilla/5.0')]
   resp = opener.open(tPage)
   if resp.code == 200 :
      data = resp.read()
      resp.close()
   elif resp.code == 404 :
      print "page do not exist"
      exit()
   elif resp.code == 403 :
   	  print "Forbidden!"
   	  exit()
   else :
      print "can not open page"
      exit()
   parser = etree.HTMLParser()
   tree = etree.parse(StringIO(data), parser)
   #etree.strip_tags(tree,'a')
   #etree.strip_tags(tree,'dt')
   #etree.strip_tags(tree,'dd')
   #etree.strip_tags(tree,'dl')
   
   result = etree.tostring(tree.getroot(), pretty_print=True, method="html", encoding='UTF-8')
   #DB(1, result)

   encoding = chardet.detect(result)
   #print encoding

   targetURL = ""
   lineSum = 0

   os.system('clear')

   resultSet = []

   # arrangedSetA rows are [dictionary-section name, result count].
   #|class|howmany|
   arrangedSetA = []

   classSet = re.findall('class="allpage fs16">([^<]+)<',result)
   for className in classSet :
   	   if className is not None:   	   	   
   	   	   #print className
   	   	   arrangedSetA.append([className,0])

   # Fill in each section's result count from the schnum spans.
   howmanySet = re.findall('class="schnum">([^<]+)<',result)
   iCnt = 0
   for howmany in howmanySet :
   	   if howmany is not None:
   	   	   #print howmany
   	   	   numbers = re.findall('([0-9])',howmany)
   	   	   #for number in numbers:
   	   	   	   #print number
   	   	   arrangedSetA[iCnt][1] = numbers[1]
   	   iCnt += 1

   #print arrangedSetA

   #dtSet = tree.xpath("//dl[@class='allList']")
   #print len(dtSet)

   # arrangedSetB rows are [entry title, detail href].
   #|title|href|
   arrangedSetB = []

   aSet = tree.xpath("//dl[@class='allList']//dt/a")
   for e in aSet:
   	   if e.text is not None:
   	   	   arrangedSetB.append([e.text,e.get('href')])   	   	   
   	   	   DB(1, e.text+"|"+e.get('href'))


   # arrangedSetC holds each entry's explanation text.
   #|explaination|
   arrangedSetC = []
   ddSet = tree.xpath("//dl[@class='allList']//dd")
   for e in ddSet:
   	   if e.text is not None:
   	   	   arrangedSetC.append(e.text)
   	   	   #print e.text+'\n'

   # Interactive display: the UTF-8 byte strings below are the section names
   # "日中辞書" / "中日辞書" (Japanese-Chinese dictionaries), whose entries
   # need Simplified->Traditional conversion.
   accumulation = 0
   for e in arrangedSetA:
		bSChineseWarning = False
		if e[0] == '\xe6\x97\xa5\xe4\xb8\xad\xe8\xbe\x9e\xe6\x9b\xb8' or e[0] == '\xe4\xb8\xad\xe6\x97\xa5\xe8\xbe\x9e\xe6\x9b\xb8':
			bSChineseWarning = True		
		print clrTx(e[0]+'\n','BLUE')
		for idx in range(int(e[1])):
			print clrTx(arrangedSetB[accumulation][0],'YELLOW')+clrTx(' > input('+str(accumulation)+')for more detail\n','GREY30')
			if bSChineseWarning == True:				
				print jianfan.jtof(arrangedSetC[accumulation])[0]
			else :
				print ripSentence(arrangedSetC[accumulation])
				#print tokenizedRomaji(ripSentence(arrangedSetC[accumulation]))
			accumulation+=1
		num = raw_input()
		iIn = parseInt(num)
		#[]==user can input detail url futher by any pause
		if iIn is not None:
			#print 'go %s'%(aSet[1].get('href'))			
			process = Popen(['python',INSFOLDER+'/gooDetailA.py','http://dictionary.goo.ne.jp'+aSet[iIn].get('href'),INSFOLDER,'1'])
			process.wait()
			break #for console user experience
					
   #myList = tree.xpath("//div[@class='allResultList']")
   #resultSet = handler(myList)
   return resultSet
Exemplo n.º 26
0
#-*- encoding:utf-8 -*-
from sys import argv
from jianfan import jtof, ftoj

f = open(argv[1])
s = (("".join([w.strip() for w in f.readlines()])).decode('utf-8')).split(" ")

f2 = open(argv[2])
lines = f2.readlines()

print "result:"
i = 0
for line in lines:
  if line[0] != " " and line[0] != "<":
    line2 = line.strip() 
    if len(line2) > 1:
      result = s[i]+jtof(line2.decode('utf-8')[1:].strip())
      print result.replace(" ","")
      i+=1