import numpy as np

import goslate
from googletrans import Translator
from textblob import TextBlob
from translate import Translator as tT
from translation import iciba


def text_blob(doc):
    """
    TextBlob sentiment, averaged over sentences.
    :param doc: Chinese document
    :return: sentiment score in (-1, 1)
    """
    # Translate the Chinese text to English, falling back through several
    # translation services until one succeeds.
    try:
        translator_en = Translator()
        en_text = translator_en.translate(doc).text
        blob = TextBlob(en_text)
    except Exception:
        try:
            blob = TextBlob(doc)
            blob = blob.translate()
        except Exception:
            try:
                en_text = iciba(doc, dst='en')
                blob = TextBlob(en_text)
            except Exception:
                try:
                    gs = goslate.Goslate()
                    en_text = gs.translate(doc, 'en')
                    blob = TextBlob(en_text)
                except Exception:
                    translator = tT(to_lang="en")
                    en_text = translator.translate(doc)
                    blob = TextBlob(en_text)
    # Weight each sentence's polarity by its subjectivity and skip
    # near-neutral sentences.
    score = []
    for sentence in blob.sentences:
        emotion = sentence.sentiment.polarity * sentence.sentiment.subjectivity
        if abs(emotion) < 0.00001:
            continue
        score.append(emotion)
    if not score:  # every sentence was near-neutral; avoid np.mean([]) -> nan
        return 0.0
    return np.mean(score)
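# A minimal usage sketch: the Chinese sample sentence below is illustrative
# (not from the project), and the call needs network access to at least one
# of the translation services tried above.
if __name__ == '__main__':
    print(text_blob('这部电影非常精彩，我很喜欢。'))  # expect a positive score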
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from translation import iciba

# Assumed request headers -- the original module defines `headers` elsewhere.
headers = {'User-Agent': 'Mozilla/5.0'}


def get_html_text(url):
    '''
    Fetch a page by URL and extract its text content.
    :param url: target URL
    :return: the extracted text, or None on failure
    '''
    # Download the page content.
    try:
        req = requests.get(url, headers=headers, timeout=3)
        req.encoding = req.apparent_encoding
        html = req.text
        soup = BeautifulSoup(html, 'lxml')
        # Remove script and style elements.
        for s in soup(['script', 'style']):
            s.extract()
        # Extract the text and join it into a single string.
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        print(text)
        # If the page is not already in Chinese, translate it via iciba.
        try:
            print(detect(text))
            if detect(text) != 'zh-cn':
                text = iciba(text, dst='zh')
                print(text)
        except Exception:
            pass
        return text
    except Exception:
        return None
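# Usage sketch: the URL mirrors one from this project's commented-out test
# calls; fetching it requires network access.
if __name__ == '__main__':
    print(get_html_text('https://en.wikipedia.org/wiki/How-to'))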
import sys

import numpy as np
import tensorflow as tf
from translation import iciba

# create_graph, NodeLookup, FLAGS, TranslateError and ConnectError are
# assumed to be defined elsewhere in this module (the function is adapted
# from TensorFlow's classify_image.py example).


def run_inference_on_image(image):
    """Runs inference on an image.

    Args:
        image: Image file name.

    Returns:
        Nothing
    """
    if not tf.gfile.Exists(image):
        tf.logging.fatal('File does not exist %s', image)
    image_data = tf.gfile.FastGFile(image, 'rb').read()

    # Creates graph from saved GraphDef.
    create_graph()

    with tf.Session() as sess:
        # Some useful tensors:
        # 'softmax:0': A tensor containing the normalized prediction across
        #   1000 labels.
        # 'pool_3:0': A tensor containing the next-to-last layer, a
        #   2048-float description of the image.
        # 'DecodeJpeg/contents:0': A tensor containing a string providing
        #   the JPEG encoding of the image.
        # Runs the softmax tensor by feeding image_data as input to the graph.
        softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')
        predictions = sess.run(softmax_tensor,
                               {'DecodeJpeg/contents:0': image_data})
        predictions = np.squeeze(predictions)

        # Creates node ID --> English string lookup.
        node_lookup = NodeLookup()

        top_k = predictions.argsort()[-FLAGS.num_top_predictions:][::-1]
        for node_id in top_k:
            human_string = node_lookup.id_to_string(node_id)
            score = predictions[node_id]
            try:
                # Translate the English label to Chinese via iciba.
                human_string_zh = iciba('Also known as ' + human_string, dst='zh')
                if sys.version_info.major <= 2:
                    human_string_zh = human_string_zh.encode('utf-8')
                print('%s(%s)---(Matching = %.5f)'
                      % (human_string, human_string_zh, score))
            except (TranslateError, ConnectError):
                print('%s---(Matching = %.5f)' % (human_string, score))
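# Usage sketch: the image path is hypothetical and assumes the Inception
# model files from TensorFlow's classify_image example have already been
# downloaded, so that create_graph() and NodeLookup() can find them.
if __name__ == '__main__':
    run_inference_on_image('/tmp/imagenet/cropped_panda.jpg')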
def get_html_text_by_file(fileName):
    '''
    Extract the text content from an already-downloaded HTML file.
    (Uses the same imports as get_html_text above.)
    :param fileName: html file
    :return: the extracted text, or None on failure
    '''
    try:
        with open(fileName, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'lxml')
        # Remove script and style elements.
        for s in soup(['script', 'style']):
            s.extract()
        # Extract the text and join it into a single string.
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        print(text)
        # If the file's text is not already in Chinese, translate it via iciba.
        try:
            print(detect(text))
            if detect(text) != 'zh-cn':
                text = iciba(text, dst='zh')
                print(text)
        except Exception:
            pass
        return text
    except IOError:
        return None
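# Usage sketch: "test.html" mirrors the file name used in this project's
# test calls; any locally saved HTML page works.
if __name__ == '__main__':
    print(get_html_text_by_file('test.html'))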
# from googletrans import Translator
# translator = Translator()
# print(translator.translate('hola'))

# from translate import Translator
# translator = Translator(from_lang="tr", to_lang="en")
# print(translator.translate("Bu bir dolma kalemdir."))

from translation import baidu, google, youdao, iciba, bing

# print(google('hello world!', dst='zh-CN'))
# print(google('hello world!', dst='ru'))
print(baidu('hello world!', dst='zh'))
print(baidu('hello world!', dst='ru'))
print(youdao('hello world!', dst='zh-CN'))
print(iciba('hello world!', dst='zh'))
print(bing('hello world!', dst='zh-CHS'))
from translation import Translation, baidu, google, youdao, iciba, bing

print(baidu('hello world!', dst='zh'))
print(youdao('hello world!', dst='zh-CN'))
print(iciba('hello world!', dst='zh'))
print(google('hello world!', dst='zh-CN'))
print(bing('hello world!', dst='zh-CHS'))
def count_page_input_by_file(fileName):
    '''
    Count the <input> elements (and password inputs) in a saved HTML file.
    :param fileName: html file
    :return: (input count, password-input count), or (-1, -1) on failure
    '''
    try:
        with open(fileName, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'lxml')
        input_cnt = len(soup.find_all("input"))
        pwd_input_cnt = len(soup.find_all("input", attrs={"type": "password"}))
        print((input_cnt, pwd_input_cnt))
        return (input_cnt, pwd_input_cnt)
    except Exception:
        return (-1, -1)


if __name__ == '__main__':
    # print(get_psychology_features("http://www.iep.utm.edu/aestheti/"))
    # print(get_psychology_features_by_url("https://en.wikipedia.org/wiki/How-to"))
    # count_page_input_by_url("https://shop.dma.org/")
    # count_page_input_by_file("test.html")
    # print(get_psychology_features_by_file("test.html"))
    text = "English is good"
    print(detect("你好呀"))
    if detect(text) == 'en':
        text = iciba(text, dst='zh')
        print(text)