def mmseg_test():
    string = "最主要 的更 动是:张无忌最后没有选定自己的配偶。自己的自己"
    print(seg_txt(string))
    output = ""
    for i in seg_txt(string):
        output += i + " "
    print(output)

def get_chinese_similarity(s1, s2):
    """Get the similarity of two Chinese strings."""
    hash1 = simhash([smart_unicode(x) for x in seg_txt(smart_str(s1))])
    hash2 = simhash([smart_unicode(x) for x in seg_txt(smart_str(s2))])
    return hash1.similarity(hash2)

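# Hypothetical usage sketch for get_chinese_similarity (not part of the original
# source). It assumes the simhash class and the smart_str/smart_unicode helpers
# imported by the snippet above are available and that seg_txt comes from the
# mmseg package; scores close to 1.0 mean the two strings share most words.
def _similarity_demo():
    a = "张无忌最后没有选定自己的配偶"
    b = "张无忌最终没有选定配偶"
    print get_chinese_similarity(a, b)
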
def Classify(self, text):
    text = re.sub(r'\d', r' ', text)
    text_words = [w for w in mmseg.seg_txt(text)]
    #print text_words
    category = -1
    max_weight = 0.0
    best_unknown_weight = 0.0
    for cat_id, word_weights in self.cat_word_weight_.items():
        #print '---------------------------'
        weight = 0.0
        unknown_weight = 0.0
        for word in text_words:
            if word in word_weights:
                w = word_weights[word] * (3 ** ((len(word) - 1) / 3))
                weight += w
                unknown_weight -= len(word) * 0.6
                #print word, w
            else:
                if word not in self.stop_words_:
                    unknown_weight += len(word) * 1.0
                    #print word, 'unknown'
                pass
        if weight > max_weight and unknown_weight < 0.0:
            max_weight = weight
            category = cat_id
            best_unknown_weight = unknown_weight
    # print text, ':', category, self.cat_id_name_map_[category], max_weight, best_unknown_weight
    return self.cat_id_name_map_[category]

def put(self, title, item_id):
    """
    title --> segment --> sadd(phrase, item_id) -> zadd(phrase->prefix, suffix, 0)
          --> pinyin  --> sadd(phrase, item_id) -> zadd(phrase->prefix, suffix, 0)
    """
    if not title or not item_id:
        return
    for phrase in mmseg.seg_txt(title.encode('utf8')):
        if not phrase:
            continue
        phrase = phrase.decode('utf8')
        self._add_phrase(chinese_key(phrase), item_id)
        for (key, suffix, score) in self._gen_suffix(phrase):
            self._add_suffix(key, chinese_key(suffix), score)
        if not self.pinyin:
            continue
        phrase = self.pinyin.translate(phrase)
        if not phrase:
            continue
        for sub_phrase in self._gen_pinyin_phrase(phrase):
            self._add_phrase(sub_phrase, item_id)
            for (key, suffix, score) in self._gen_suffix(re.sub('\\s+', '', sub_phrase)):
                self._add_suffix(key, suffix, score)

def train(filename, parser):
    fname = basename(filename)
    cache_path = join(CACHE_PATH, fname)
    if exists(cache_path):
        return
    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue
        tag_id_set = set(tag_id_list)
        if not tag_id_set:
            continue
        for tid in tuple(tag_id_set):
            tag_id_set.update(PTAG.get(tid, ()))
        word2count = defaultdict(int)
        word_list = list(seg_txt(utf8_ftoj(str(txt))))
        for i in word_list:
            word2count[i] += 1
        for k, v in word2count.iteritems():
            if k not in word2tag_count:
                word2tag_count[k] = {}
            t = word2tag_count[k]
            for id in tag_id_set:
                if id not in t:
                    t[id] = 0
                t[id] += (1 + log(float(v)))
    tofromfile.tofile(cache_path, word2tag_count)

def remove_stop_words(self, text):
    tokens = mmseg.seg_txt(text)
    left_words = []
    for t in tokens:
        if t not in self.stopwords:
            left_words.append(t)
    return "".join(left_words)

def suggest(self, phrase, start=1, limit=10, namespace='', expires=600):
    temp = re.split(r'\s+', phrase.strip())
    phrase = [item for item in mmseg.seg_txt(phrase.encode('utf8'))]
    phrase.extend(temp)
    phrase = map(chinese_key, phrase)
    start = (start - 1) * limit
    result_key = 'ac-suggest:' + '|'.join(phrase)
    results = self.r.zrevrange(result_key, start, start + limit - 1)
    if results:
        return results
    prefix = self.suffix_key_prefix + self.namespace
    prefix_len = len(prefix)
    phrase_keys = []
    for sub_phrase in phrase:
        key = prefix + sub_phrase
        results = self._suggest(key, limit)
        # strip the prefix off the keys that indicated they matched a lookup
        cleaned_keys = map(lambda x: x[prefix_len:], results)
        cleaned_keys = map(lambda x: self.phrase_key_prefix + self.namespace + x, cleaned_keys)
        phrase_keys.extend(cleaned_keys)
    if not phrase_keys:
        return []
    # combine the candidate sets (zinterstore keeps only items present in every set)
    num = self.r.zinterstore(result_key, list(set(phrase_keys)))
    self.r.expire(result_key, expires)
    # fetch the cached results
    results = self.r.zrevrange(result_key, start, start + limit - 1)
    return results

def split_words(text):
    """Segment text into a list of words."""
    words = []
    for i in seg_txt(text):
        words.append(i)
    return words

def GetTermsFrequency(text):
    ret = {}
    for w in mmseg.seg_txt(text):
        w = w.strip()
        if len(w) > 0:
            ret.setdefault(w, 0)
            ret[w] += 1
    return ret

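# Hypothetical quick check for GetTermsFrequency above (not part of the original
# source). It assumes the mmseg package providing seg_txt is installed and a
# Python 2 module with a UTF-8 source encoding, since seg_txt there consumes and
# yields UTF-8 byte strings.
def _terms_frequency_demo():
    freq = GetTermsFrequency("今天天气不错,今天心情也不错")
    for term, count in sorted(freq.items(), key=lambda kv: -kv[1]):
        print term, count
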
def store_movie(movie):
    phrase = movie["title"]
    seg_phrase = " ".join(mmseg.seg_txt(phrase))
    _pinyin_phrase = pinyin.get_pinyin(phrase)
    py_phrase = "".join([p[0] for p in _pinyin_phrase]).encode("utf-8")
    pinyin_phrase = "".join(_pinyin_phrase).encode("utf-8")
    phrase = "%s %s %s %s" % (phrase, seg_phrase, pinyin_phrase, py_phrase)
    engine.store_json(movie["id"], phrase, movie)

def tf_idf(self, txt):
    tf = defaultdict(int)
    for i in seg_txt(str(txt.lower())):
        tf[i] += 1
    result = []
    for k, v in tf.iteritems():
        if k in self._idf:
            result.append((k, v * self._idf[k]))
    return result

def generate_segmented_content_file(self):
    my_file = file('ordered_segmented_content_file.txt', 'w')
    with open('ordered_content_file.txt') as f:
        for line in f:
            print "Segmenting line {0}...".format(self.count)
            for segment in seg_txt(line):
                my_file.write(segment + ' ')
            my_file.write('\n')
            self.count += 1
    my_file.close()

def parse(self, words):
    words = SearchIndex.__to_unicode(words)
    _seg_words = [word for word in seg_txt(words)]
    seg_words = filter(None, _seg_words)
    results = []
    for word in seg_words:
        word_utf8 = SearchIndex.__to_unicode(word)
        decode_word = unidecode(word_utf8)
        key = self.cache_key_prefix + slugify(decode_word)
        results.append(key)
    return results

def get_terms(self):
    values = []
    for field in self._fields:
        values.append(self._data[field].encode('utf8'))
    text = ' '.join(values)
    terms = []
    for term in seg_txt(text):
        terms.append(term.decode('utf8'))
    return terms

def gen_terms(cont):
    if cont is None:
        return []
    cont = cont.strip()
    if len(cont) == 0:
        return []
    if len(cont) < TERM_MIN_LENGTH:
        return []
    terms = [item for item in seg_txt(cont) if len(item) > TERM_MIN_LENGTH]
    if len(cont) < 10:
        terms.append(cont)
    terms = list(set(terms))
    return terms

def main():
    count = 0
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = mmseg.seg_txt(text.encode('utf-8'))
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
        o.write((' '.join(wds)).decode('utf-8'))
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start
    print count
    print '---------------------------------------------------------------'

def generate_user_dict(w_uid):
    user = Account.objects.get(w_uid=w_uid)
    wbs = user.watchweibo.all()
    wordset = Set()
    print 'Generating user dict from %d weibo posts' % len(wbs)
    for wb in wbs:
        for word in seg_txt(wb.text.encode('utf-8', 'ignore')):
            if len(word) > 3:
                wordset.add(word.lower().strip())
    with open("../data/user_dict/%s.dic" % w_uid, "w") as dic_file:
        for word in wordset:
            dic_file.write("%s\n" % word)

def input_raw(self, sentence, is_spam):
    """
    Feed in one training sample.
    :param sentence: the sentence to train on
    :param is_spam: whether the message is spam
    :return:
    """
    sms = mmseg.seg_txt(sentence)
    sms = list(sms)
    for flag, word in enumerate(sms):
        offset = 0 if is_spam else 1
        if word not in self.sms_value:
            self.sms_value[word] = [1 - offset, offset]
        else:
            self.sms_value[word][offset] += 1
        self.sms_count[offset] += 1

def segment(string):
    # alphas = ''
    # unicode = ''
    # last_is_alpha = False
    # for char in string:
    #     if char.isalpha():
    #         if not last_is_alpha:
    #             alphas += ' '
    #         alphas += char
    #     else:
    #         if last_is_alpha:
    #             unicode += ' '
    #         unicode += char
    # print "ALPHAS", alphas
    # print "UNICODE", unicode[:20]
    # return alphas + u' '.join([txt.decode('utf8') for txt in seg_txt(unicode.encode('utf8'))])
    return u' '.join([txt.decode('utf8') for txt in seg_txt(string.encode('utf8'))])

def count_occurance(self, text=''):
    if not isinstance(text, basestring):
        raise Exception("input must be instance of String")
    separated_by_non_alphanumerics = text.replace('/', ' ').replace('\\', ' ').replace('>', ' ').replace('<', ' ').lower()
    #print separated_by_non_alphanumerics
    without_one_or_two_words = self.__class__.one_or_two_words_re.sub('', separated_by_non_alphanumerics)
    without_dots = without_one_or_two_words.replace(".", "")
    text_chunks = self.stopwords.to_re().sub('', without_dots).split()
    frequencies = {}
    for word in text_chunks:
        seg = mmseg.seg_txt(word)
        for s in seg:
            frequencies[s] = (frequencies[s] if frequencies.has_key(s) else 0) + 1
    return frequencies

def create_action(self):
    cache_key = "WEIBO:HOT:%s" % self.user.sns_id
    cache.delete(cache_key)
    tmp_cache_key = "TEMP:WEIBO:HISTORY:%s:::" % self.user.sns_id
    weibo_history = self.user.weibo_history
    for text in weibo_history:
        terms = seg_txt(text.encode('utf-8'))
        for term in terms:
            index_key = '%s%s' % (BASIC_TAG_PREFIX, term)
            if cache.exists(index_key):
                key = tmp_cache_key + term.decode('utf-8')
                cache.incr(name=key, amount=1)
    keys = cache.keys(pattern="%s*" % tmp_cache_key)
    for key in keys:
        name = key.split(":::")[1]
        value = float(cache.get(key))
        cache.zadd(cache_key, value, name)
        cache.delete(key)
        tag = BasicTag.get_by_name(name=name)
        if not tag:
            continue
        relations = tag.friends
        score = tag.score
        for f in relations:
            items = f.split(':::')
            obj_name = items[0]
            obj_value = float(items[1])
            result = obj_value / 50 * value
            cache.zadd(cache_key, result, obj_name)
    results = cache.zrevrange(name=cache_key, start=0, num=30, withscores=True)
    tags = [result[0].decode('utf-8') + '__' + str(result[1]) for result in results]
    self.user.update(set__tags=tags)

def remove(self, title, item_id):
    if not title or not item_id:
        return
    for phrase in mmseg.seg_txt(title.encode('utf8')):
        if not phrase:
            continue
        phrase = phrase.decode('utf8')
        self._rem_phrase(chinese_key(phrase), item_id)
        if not self.pinyin:
            continue
        phrase = self.pinyin.translate(phrase)
        if not phrase:
            continue
        for sub_phrase in self._gen_pinyin_phrase(phrase):
            self._rem_phrase(sub_phrase, item_id)

def get(url, headers, body):
    query = headers.get('QUERY')
    if query is None or query.strip() == '':
        return 400, 'Bad Request', 'query field is not found.', None
    params = dict((n, v) for n, v in (i.split('=', 1) for i in query.split('&')))
    if 'query' not in params:
        return 400, 'Bad Request', 'query field is not found.', None
    text = params['query']
    search_query = helpers.decode_urlencoding(text)
    # helpers.log_search_query(search_query)
    global logger
    logger.debug('incoming query: %s', text)
    terms = seg_txt(search_query)
    logger.debug('terms from query: %s', terms)
    database = xapian.Database('../indexes/')
    enquire = xapian.Enquire(database)
    l = []
    for term in terms:
        l.append(term)
    q = xapian.Query(xapian.Query.OP_OR, l)
    enquire.set_query(q)
    matches = enquire.get_mset(0, 100)
    print '%i results found.' % matches.get_matches_estimated()
    print 'Result - %i:' % matches.size()
    r = []
    for m in matches:
        # print '%i: %i%% docid=%i [%s]' % (m.rank + 1, m.percent, m.docid, m.document.get_data())
        r.append(m.document.get_data())
    print json.dumps(r)
    return 200, 'OK', json.dumps(r), None

def predict(self, sentence):
    def is_zero(value):
        return value if value > 0 else 0.01

    sms = mmseg.seg_txt(sentence)
    sms = set(sms)
    sms_prob_ham = sms_prob_spam = 1
    for flag, word in enumerate(sms):
        word_prob_spam = word_prob_ham = 0
        if word in self.sms_value:
            value = self.sms_value[word]
            word_prob_spam = float(value[0]) / self.sms_count[0]  # probability of this word given spam
            word_prob_ham = float(value[1]) / self.sms_count[1]   # probability of this word given ham
        word_prob_spam = is_zero(word_prob_spam)
        word_prob_ham = is_zero(word_prob_ham)
        # combine the per-word estimates
        prob_is_spam = word_prob_spam / (word_prob_spam + word_prob_ham)  # word_prob_ham acts as the complement
        sms_prob_spam *= prob_is_spam
        sms_prob_ham *= (1 - prob_is_spam)
    return sms_prob_spam / (sms_prob_spam + sms_prob_ham)

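# Hypothetical end-to-end sketch for the naive Bayes SMS filter above (not part
# of the original source). It assumes input_raw and predict are methods of a
# classifier class, here called SpamFilter, whose __init__ sets sms_value = {}
# and sms_count = [0, 0]; index 0 tracks spam counts and index 1 tracks ham
# counts, matching how input_raw fills them in.
def _spam_filter_demo():
    clf = SpamFilter()
    clf.input_raw("恭喜您获得大奖,请点击链接领取", True)   # spam sample
    clf.input_raw("今晚一起吃饭吗", False)                 # ham sample
    # Scores close to 1.0 mean "probably spam".
    print clf.predict("点击链接领取大奖")
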
def normalize_syn_words(self, text):
    tokens = mmseg.seg_txt(text)
    word_list = [x for x in tokens]
    wlist_len = len(word_list)
    for i in xrange(wlist_len):
        if word_list[i] == "":
            continue
        curr_len = 0
        j = i
        while j < wlist_len:
            curr_len += len(word_list[j])
            if curr_len > self.max_len_to_replace:
                break
            j += 1
        while j > i:
            wrf = "".join(word_list[i:j])
            if wrf in self.replace_dict:
                wrt = self.replace_dict[wrf]
                word_list[i] = wrt
                for k in xrange(i + 1, j):
                    word_list[k] = ""
                break
            j -= 1
    return "".join(word_list)

def Classify(self, text):
    text = re.sub(r'\d', r' ', text)
    text_words = [w for w in mmseg.seg_txt(text)]
    #print text_words
    category = -1
    max_weight = 0.0
    best_unknown_weight = 0.0
    for cat_id, word_weights in self.cat_word_weight_.items():
        #print '---------------------------'
        weight = 0.0
        unknown_weight = 0.0
        for word in text_words:
            if len(word.strip()) == 0:
                continue
            if word in word_weights:
                w = word_weights[word] * (3 ** ((len(word) - 1) / 3))
                weight += w
                unknown_weight -= len(word) * 0.87
                #print word, w
            else:
                if word not in self.stop_words_:
                    if not word[0].isalpha():  # ignore unrecognized English words
                        unknown_weight += len(word) * 1.0
                        #print word, 'unknown'
                else:
                    #print word, 'stop word'
                    pass
        #print 'unknown_weight', unknown_weight
        if weight > max_weight and unknown_weight < 0.0:
            max_weight = weight
            category = cat_id
            best_unknown_weight = unknown_weight
    # print text, ':', category, self.cat_id_name_map_[category], max_weight, best_unknown_weight
    return self.cat_id_name_map_[category]

def txt2word(txt):
    return seg_txt(utf8_ftoj(str(txt.lower())))

#encoding=utf-8
import mmseg
#from pymmseg import mmseg
#mmseg.dict_load_defaults()

f = open('MMSEGoutput.txt', 'w')
input = open('testinput.txt')
while True:
    text = input.readline()
    for i in mmseg.seg_txt(text):
        print >> f, i, ' ',
    #f.write(testseg)
    print >> f
    if len(text) == 0:
        break
f.flush()
f.close()
input.close()

#f = open('1.txt', 'w')
#for i in mmseg.seg_txt(text):
#    print >> f, i
#algor = mmseg.Algorithm(text)
#for tok in algor:
#    print >> f, '%s [%d..%d]' % (tok.text, tok.start, tok.end)
#    print '%s' % tok.text

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
main.py

Author: WooParadog
Email: [email protected]

Created on 2011-11-13
'''
import mmseg
import mmseg.search

f = open('text')
dic = {}
for word in mmseg.seg_txt(f.read()):
    if word in dic.keys():
        dic[word] = int(dic[word] + 1)
    else:
        dic[word] = 1
f.close()

import operator
word = sorted(dic.iteritems(), key=operator.itemgetter(1), reverse=True)
print word

f = open('word', 'w')
f.writelines([str(k) + ":" + str(v) + "\n" for (k, v) in word])
f.close()

n = 0
for name in names:
    f = os.path.join(dir, name)
    print '\nFile: ', f, '...'
    nout = name + '.txt'
    if os.path.exists(nout):
        print '-- SKIPPED'
        continue
    fout = open(nout, 'w')
    subject, text = read_eml(f)
    # words = fc(subject)
    words = seg_txt(subject)
    fout.write('{}\n\n'.format(' '.join(words)))
    lines = text.splitlines()
    for line in lines:
        #text = '感谢您关注语言云,您的语言云账号已经激活。这封邮件包含您调用语言云服务时使用的token,以及一些其他帮助您快速使用语言云的信息。'
        line = line.strip()
        # print '[', line, ']'
        if line != '':
            #words = fc(line)
            # materialize the generator so it can be both written and printed
            words = list(seg_txt(line))
            fout.write(' '.join(words) + '\n')
            for w in words:
                print w
    fout.close()

def append(self, txt):
    for i in set(seg_txt(str(txt.lower()))):
        self._idf[i] += 1
    self._count += 1

def generate_feature(wb, dict):
    fea = [0] * len(dict)
    # weibo text
    word_count = 0
    for wd in seg_txt(wb.text.encode('utf-8', 'ignore')):
        wd = wd.lower().strip()
        if len(wd) > 3 and wd in dict:
            fea[dict[wd]] += 1
            word_count += 1
    print 'found %d word in a weibo' % word_count
    # add user features
    owner = wb.owner
    fea.append(int(owner.w_province))
    fea.append(int(owner.w_city))
    if owner.w_url:
        fea.append(1)
    else:
        fea.append(0)
    fea.append(len(owner.w_description))
    if 'm' in owner.w_gender:
        fea.append(1)
    else:
        fea.append(0)
    fea.append(int(owner.w_followers_count))
    fea.append(int(owner.w_friends_count))
    fea.append(int(owner.w_statuses_count))
    fea.append(int(owner.w_favourites_count))
    fea.append(int(owner.w_bi_followers_count))
    fea.append((datetime.now() - owner.w_created_at).days / 100)
    if owner.w_verified:
        fea.append(1)
    else:
        fea.append(0)
    # add weibo features
    fea.append(int(wb.reposts_count))
    fea.append(int(wb.comments_count))
    fea.append(int(wb.attitudes_count))
    if re.search("#.*?#", wb.text):
        fea.append(1)
    else:
        fea.append(0)
    fea.append(len(wb.text))
    own_text = re.search("(.*?)//@", wb.text)
    if own_text:
        fea.append(len(own_text.group(1)))
    else:
        fea.append(len(wb.text))
    # TODO: categorize the source field
    fea.append(len(wb.source))
    if wb.retweeted_status:
        fea.append(0)
    else:
        fea.append(1)
    if wb.thumbnail_pic:
        fea.append(1)
    else:
        fea.append(0)
    fea.append(wb.created_at.hour)
    fea.append(wb.created_at.weekday())
    # TODO: apply a time-decay formula to repost/comment counts
    return fea

def txt_tag_generator(self):
    word2id = self.word2id
    for k, v in self._txt_tag_generator():
        words = [i for i in list(seg_txt(str(k).lower())) if not i.isdigit()]
        yield word2id.id_list_by_word_list(words), v

def tf_idf_seg_txt(txt):
    txt = txt.replace('。', ' ').replace(',', ' ')
    word_list = list(seg_txt(txt))
    return tf_idf(word_list)

def tokenize(self, stream):
    import mmseg
    for chunk in self._imp_tokenizer.tokenize(stream):
        r = mmseg.seg_txt(chunk.encode('utf8', 'ignore'))
        for word in r:
            yield word.decode('utf8', 'ignore')

def separatewords(self, text):
    words = [s.lower() for s in seg_txt(text.encode('utf-8')) if s != '']
    print words
    return words

def isindexed(self, url):
    u = self.con.execute(
        "select rowid from urllist where url='%s'" % url).fetchone()
    if u != None:
        v = self.con.execute(
            'select * from wordlocation where urlid=%d' % u[0]).fetchone()
        if v != None:
            print "indexed :", url
            return True
    return False

def addlinkref(self, urlFrom, urlTo, linkText):
    fromid = self.getentryid('urllist', 'url', urlFrom)
    toid = self.getentryid('urllist', 'url', urlTo)
    cur = self.con.execute(
        "select rowid from link where fromid='%s' and toid='%s'" % (fromid, toid))
    res = cur.fetchone()
    if res == None:
        cur = self.con.execute(
            "insert into link (fromid,toid) values ('%s','%s')" % (fromid, toid))
        linkid = cur.lastrowid
    else:
        linkid = res[0]
    words = self.separatewords(linkText)
    for word in words:
        wordid = self.getentryid('wordlist', 'word', word)
        cur = self.con.execute(
            "insert into linkwords (wordid,linkid) values ('%s','%s')" % (linkid, wordid))

def crawl(self, pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib2.urlopen(page)
            except:
                print "Could not open %s" % page
                continue
            soup = BeautifulSoup(c.read())
            if not self.isindexed(page):
                self.addtoindex(page, soup)
            else:
                continue
            links = soup('a')
            for link in links:
                if ('href' in dict(link.attrs)):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
            self.dbcommit()
        pages = newpages

def createindextables(self):
    self.con.execute('create table IF NOT EXISTS urllist(url)')
    self.con.execute('create table IF NOT EXISTS wordlist(word)')
    self.con.execute('create table IF NOT EXISTS wordlocation(urlid integer,wordid integer,location)')
    self.con.execute('create table IF NOT EXISTS link(fromid integer,toid integer)')
    self.con.execute('create table IF NOT EXISTS linkwords(wordid integer,linkid integer)')
    self.con.execute('create index IF NOT EXISTS wordidx on wordlist(word)')
    self.con.execute('create index IF NOT EXISTS urlidx on urllist(url)')
    self.con.execute('create index IF NOT EXISTS wordurlidx on wordlocation(wordid)')
    self.con.execute('create index IF NOT EXISTS urltoidx on link(toid)')
    self.con.execute('create index IF NOT EXISTS urlfrom on link(fromid)')
    self.dbcommit()

def calculatepagerank(self, iterations=20):
    self.con.execute('drop table if exists pagerank')
    self.con.execute('create table pagerank(urlid primary key,score)')
    self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
    self.dbcommit()
    for i in range(iterations):
        print "Iteration %d" % (i)
        for (urlid,) in self.con.execute('select rowid from urllist'):
            pr = 0.15
            for (linker,) in self.con.execute(
                    'select distinct fromid from link where toid=%d' % urlid):
                linkingpr = self.con.execute(
                    'select score from pagerank where urlid=%d' % linker).fetchone()[0]
                linkingcount = self.con.execute(
                    'select count(*) from link where fromid=%d' % linker).fetchone()[0]
                pr += 0.85 * (linkingpr / linkingcount)
            self.con.execute('update pagerank set score=%f where urlid=%d' % (pr, urlid))
        self.dbcommit()

#!/usr/bin/env python
#coding:utf-8
from __future__ import print_function

import sys

from mmseg import seg_txt

for line in sys.stdin:
    blks = str.split(line)
    out_line = blks[0]
    for i in range(1, len(blks)):
        if blks[i] == "[VOCALIZED-NOISE]" or blks[i] == "[NOISE]" or blks[i] == "[LAUGHTER]":
            out_line += " " + blks[i]
            continue
        for j in seg_txt(blks[i]):
            out_line += " " + j
    print(out_line)

#!/usr/bin/env python
#coding:utf-8
import sys

from mmseg import seg_txt

for line in sys.stdin:
    blks = str.split(line)
    out_line = blks[0]
    for i in range(1, len(blks)):
        if blks[i] == "[VOCALIZED-NOISE]" or blks[i] == "[NOISE]" or blks[i] == "[LAUGHTER]":
            out_line += " " + blks[i]
            continue
        for j in seg_txt(blks[i]):
            out_line += " " + j
    print out_line

#!/usr/bin/env python3
# coding:utf-8
import sys

from mmseg import seg_txt

for line in sys.stdin:
    blks = str.split(line)
    out_line = blks[0]
    for i in range(1, len(blks)):
        if (blks[i] == "[VOCALIZED-NOISE]"
                or blks[i] == "[NOISE]"
                or blks[i] == "[LAUGHTER]"):
            out_line += " " + blks[i]
            continue
        for j in seg_txt(blks[i].encode()):
            out_line += " " + j.decode()
    print(out_line)

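# Illustrative note (not part of the original source): given a transcript line
# such as
#   utt001 [NOISE] 今天天气不错
# the scripts above keep the utterance id and the noise tags untouched and pass
# the remaining tokens through seg_txt, producing something like
#   utt001 [NOISE] 今天 天气 不错
# (the exact split depends on the mmseg dictionary in use).
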