# Parse a Mandarin sentence with a Stanford CoreNLP server via NLTK.
from nltk.parse import CoreNLPParser
from nltk import Tree
from mafan import simplify


def 提華語句法樹(bunji="我 喜歡 豬", url='http://localhost:9000'):
    try:
        句法分析器 = CoreNLPParser(url=url)
    except Warning as 錯誤:
        print('Warning=', 錯誤)
    分析結果指標 = 句法分析器.parse(simplify(bunji).split())
    該句結果字串 = next(分析結果指標)
    # Print the result as a bracketed string
    # (ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))
    print('該句結果字串=', 該句結果字串)
    # Draw the tree described by the string
    #        ROOT
    #         |
    #         IP
    #      ___|____
    #     |        VP
    #     |     ___|___
    #     NP   |       NP
    #     |    |       |
    #     PN   VV      NN
    #     |    |       |
    #     我   喜欢    猪
    該句結果字串.pretty_print()
    return 該句結果字串


##### Recover the original tokens from a tree string
a = Tree.fromstring("(ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))")
# ['我', '喜欢', '猪']
print(a.leaves())
# (ROOT 我 喜欢 猪)
print(a.flatten())
def Load(filename, stopwords, output_filename):
    global MIN_PERCENT
    global MIN_SUP
    candidate = set()
    for line in codecs.open(filename, 'r', 'utf-8'):
        if line[0].isdigit():
            continue
        tokens = line.strip().split('\t')
        valid = False
        for token in tokens[2:]:
            support = int(token.split(':')[-2])
            percentage = float(token.split(':')[-1][:-1])
            if (percentage >= MIN_PERCENT) or (support >= MIN_SUP):
                name = ':'.join(token.split(':')[:-2])
                valid = True
                if NoSeparator(name) and StopWordChecking(name, stopwords):
                    candidate.add(name.lower())
        if valid:
            name = tokens[0]
            if NoSeparator(name) and StopWordChecking(name, stopwords):
                candidate.add(name.lower())
    out = codecs.open(output_filename, 'w', 'utf-8')
    if LANGUAGE == 'zh':
        seen = set()
        for name in candidate:
            name = simplify(''.join(name.split()))
            seen.add(name)
        candidate = seen
    print len(candidate)
    for name in candidate:
        out.write(name + '\n')
    out.close()
def seg(sentence):
    # Tokenize the simplified sentence and run the token-classification model.
    paraphrase = tokenizer.encode_plus(simplify(sentence), return_tensors="pt")
    paraphrase['attention_mask'][-1] = 0
    # pdb.set_trace()
    for key in paraphrase.keys():
        paraphrase[key] = paraphrase[key].to(device)
    paraphrase_classification_logits = model(**paraphrase)[0]
    paraphrase_results = paraphrase_classification_logits.argmax(axis=-1)[0]
    paraphrase_results = paraphrase_results[1:-1]  # drop the special-token positions at both ends
    # pdb.set_trace()
    # Rebuild words from the per-character tags: 3 emits a single character,
    # 0/1 extend the current word, and 2 closes a multi-character word.
    res = list()
    length = 0
    word = False
    for i in range(len(paraphrase_results)):
        if paraphrase_results[i] == 3:
            res.append(sentence[i])
            length = 0
        if paraphrase_results[i] == 0:
            length += 1
        if paraphrase_results[i] == 1:
            length += 1
        if paraphrase_results[i] == 2:
            res.append(sentence[i - length:i + 1])
            length = 0
    # print(''.join(res) == sentence)
    print(' '.join(res), '\n')
    return ','.join(res) + '\n'
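# seg() above relies on module-level `tokenizer`, `model`, and `device` objects that the
# excerpt does not define. A minimal setup sketch, assuming a HuggingFace token-classification
# model with four labels (matching the 0-3 tags seg() consumes); the checkpoint name is a
# placeholder, not the one used by the original author.
import torch
from transformers import BertTokenizer, BertForTokenClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForTokenClassification.from_pretrained("bert-base-chinese", num_labels=4)
model = model.to(device)
model.eval()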
def Load(filename, stopwords, output_filename):
    global MIN_PERCENT
    global MIN_SUP
    candidate = set()
    for line in codecs.open(filename, 'r', 'utf-8'):
        if line[0].isdigit():
            continue
        tokens = line.strip().split('\t')
        valid = False
        for token in tokens[3:]:
            support = int(token.split(':')[-2])
            percentage = float(token.split(':')[-1][:-1])
            if (percentage >= MIN_PERCENT) or (support >= MIN_SUP):
                name = ':'.join(token.split(':')[:-2])
                valid = True
                if NoSeparator(name) and StopWordChecking(name, stopwords):
                    candidate.add(name.lower())
        if valid:
            name = tokens[0]
            if NoSeparator(name) and StopWordChecking(name, stopwords):
                candidate.add(name.lower())
    out = codecs.open(output_filename, 'w', 'utf-8')
    if LANGUAGE == 'zh':
        seen = set()
        for name in candidate:
            name = simplify(''.join(name.split()))
            seen.add(name)
        candidate = seen
    print len(candidate)
    for name in candidate:
        out.write(name + '\n')
    out.close()
def preprocess_text(text, truncate_at=100):
    '''Keep only the first few characters of the text and strip anything outside the allowed character set.'''
    truncated = text[:truncate_at]
    cleaned = REGEX_TO_REMOVE.sub(r'', truncated)
    return mafan.simplify(cleaned)
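# preprocess_text() expects a module-level REGEX_TO_REMOVE pattern (and mafan) that the excerpt
# does not define. A hypothetical definition for illustration only; the original pattern is not
# shown in the source. This one keeps CJK ideographs, ASCII letters, and digits.
import re
import mafan

REGEX_TO_REMOVE = re.compile(u'[^\u4e00-\u9fffA-Za-z0-9]')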
def can_st(page):
    simplified = simplify(page) == page.decode("utf8")
    traditional = tradify(page) == page.decode("utf8")
    # only simplified
    if simplified and not traditional and not config.zh_s:
        return False
    # only traditional
    elif traditional and not simplified and not config.zh_t:
        return False
    else:
        return config.zh_t or config.zh_s
def Load(filename, output_filename):
    candidate = set()
    for line in codecs.open(filename, 'r', 'utf-8'):
        tokens = line.strip().split('\t')
        for token in tokens[3:]:
            name = ':'.join(token.split(':')[:-2])
            if LANGUAGE == 'zh':
                name = simplify(''.join(name.split()))
            candidate.add(name.lower())
        name = tokens[0]
        # name = simplify(''.join(name.split('')))
        if LANGUAGE == 'zh':
            name = simplify(''.join(name.split()))
        candidate.add(name.lower())
    print len(candidate)
    out = codecs.open(output_filename, 'w', 'utf-8')
    for name in candidate:
        out.write(name + '\n')
    out.close()
def TM(filename, k, weights, simpmode=False):
    sys.stderr.write("Reading translation model from %s...\n" % (filename, ))
    tm = {}
    for line in open(filename).readlines():
        (f, e, features) = line.strip().split(" ||| ")
        tm.setdefault(tuple(f.split()), []).append(
            phrase(e, [float(i) for i in features.strip().split()]))
    tmptm = {}
    for f in tm:
        # prune all but top k translations
        tm[f].sort(key=lambda x: sum(p * q for p, q in zip(x.features, weights)),
                   reverse=True)
        del tm[f][k:]
        if simpmode:
            from mafan import simplify
            sf = tuple(simplify(f[i].decode('utf-8')) for i in range(len(f)))
            if sf != tuple(f[i].decode('utf-8') for i in range(len(f))):
                if sf in tm:
                    for p in tm[f]:
                        found = False
                        for pi, sp in enumerate(tm[sf]):
                            if sp.english == p.english:
                                found = True
                                tm[sf][pi].features[0] = (tm[sf][pi].features[0] + p.features[0]) / 2
                                tm[sf][pi].features[1] = math.log(
                                    math.exp(tm[sf][pi].features[1]) + math.exp(p.features[1]))
                                tm[sf][pi].features[2] = max([tm[sf][pi].features[2], p.features[2]])
                                tm[sf][pi].features[3] = max([tm[sf][pi].features[3], p.features[3]])
                        if not found:
                            tm[sf].append(
                                phrase(p.english, [
                                    p.features[0] / 2,
                                    math.log(math.exp(p.features[1]) + 1),
                                    p.features[2],
                                    p.features[3]
                                ]))
                else:
                    for p in tm[f]:
                        tmptm.setdefault(sf, []).append(p)
    for f in tmptm:
        tm[f] = tmptm[f]
    return tm
def StdNm(nonstd=None):
    ## make a DataFrame indexed by standardized name and
    ## containing columns "FirstName", "OtherNames", "OtherNames1"
    std = pd.read_csv("csv/StandardNames.csv")
    std["FullName"] = std["LastName"] + std["FirstName"]
    std["FullName"].fillna(method="ffill", inplace=True)
    std.set_index("FullName", inplace=True)
    #### Simplified characters for LastName happen sometimes
    std["ConvLast"] = [
        sTR if pd.isnull(sTR)
        else mf.simplify(sTR) if mf.is_traditional(sTR)
        else mf.tradify(sTR)
        for sTR in std["LastName"]
    ]
    std["LastOth"] = std["LastName"].fillna(method="ffill") + std['OtherNames']
    std["ConvFst"] = std["ConvLast"].fillna(method="ffill") + std['FirstName']
    std["ConvOth"] = std["ConvLast"].fillna(method="ffill") + std['OtherNames']
    std.drop(["Details", "Studio", "LastName", "ConvLast"], axis=1, inplace=True)

    ## make a dataframe of {key: alternative names, value: standard names} with
    ## unique keys and overlapping values
    map_df = pd.DataFrame()
    for colName in list(std.columns):
        df = pd.DataFrame({"key": std[colName], "value": std.index})
        map_df = map_df.append(df, ignore_index=True)
    map_df.dropna(inplace=True)

    ## Standardize names in the given Series
    map_dict = map_df.set_index('key').to_dict()['value']

    def standardize_names(participant):
        if participant in map_dict:
            return map_dict[participant]
        else:
            return participant

    ans = nonstd.map(standardize_names)
    return ans
def extract_glove_embeddings():
    log("extract_glove_embeddings()...")
    _, word_to_index = read_list_file(word_file)
    word_list = []
    embedding_list = []
    with codecs.open(glove_file, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip().split()
            word = mafan.simplify(line[0])
            if word not in word_to_index:
                continue
            embedding = np.array([float(i) for i in line[1:]])
            word_list.append(word)
            embedding_list.append(embedding)
    np.save(pretrained_word_file, word_list)
    np.save(pretrained_embedding_file, embedding_list)
    log(" %d pre-trained words\n" % len(word_list))
    return
def process_language(ele):
    try:
        text_list = ele.text.split('\n')
        text_list = [replace_special(x) for x in text_list]
        languages = [guess_language(l) for l in text_list]
        en_index = [l == 'en' for l in languages]
        zh_index = [l == 'zh' for l in languages]
        en_lines = list(compress(text_list, en_index))
        zh_lines = list(compress(text_list, zh_index))
        if len(en_lines) == 0 or len(zh_lines) == 0:
            return None
        en_text = ' '.join(en_lines)
        if len(check_chinese(en_text)) > 0:
            return None
        zh_text = simplify(' '.join(zh_lines))
        if len(check_english(zh_text)) > 0:
            return None
        text_line = en_text + ' | ' + zh_text
        return text_line
    except:
        return None
def dup_check(self, subtitles):
    '''
    Check if there are duplicated subtitles.

    :returns: subtitles
        Subtitles without duplication.
    '''
    # convert all encoding to utf-8, trad -> simp
    for sub in subtitles:
        enc = chardet.detect(sub['content'])['encoding']
        # for simp. chinese
        if enc == 'GB2312':
            sub['content'] = sub['content'].decode('gb18030')
        else:
            sub['content'] = sub['content'].decode(enc)
        # for big5, convert!
        if enc == 'Big5':
            sub['content'] = mafan.simplify(sub['content'])
    # find dups
    dup_tags = [False] * len(subtitles)
    for i in range(len(subtitles)):
        for j in range(i + 1, len(subtitles)):
            sub_a = subtitles[i]['content']
            sub_b = subtitles[j]['content']
            sim = difflib.SequenceMatcher(None, sub_a, sub_b).real_quick_ratio()
            logging.debug('Similar ratio between %d and %d is %f' % (i, j, sim))
            if sim >= 0.9:
                if len(sub_a) >= len(sub_b):
                    dup_tags[j] = True
                else:
                    dup_tags[i] = True
    new_subs = [s for i, s in enumerate(subtitles) if dup_tags[i] == False]
    logging.debug('Total %d subtitles remain.' % len(new_subs))
    return new_subs
def get_candidates(inputfile, tm, lm, weights,
                   stack_size=10, nbest=None, simpmode=True,
                   separate_unknown_words=False, verbose=False):
    if nbest is None:
        nbest = stack_size
    print >> sys.stderr, "Decoding: " + inputfile
    print >> sys.stderr, "Reading input..."
    french = [line.strip().split() for line in open(inputfile).readlines()]  # list of list
    if simpmode:
        from mafan import simplify
        for li, line in enumerate(french):
            for wi, word in enumerate(line):
                french[li][wi] = simplify(word.decode('utf-8')).encode('utf-8')

    # tm should translate unknown words as-is with a small probability
    # (i.e. only fallback to copying unknown words over as the last resort)
    for i in xrange(len(french)):
        j = 0
        while j < len(french[i]):
            word = french[i][j]
            if (word, ) not in tm:
                flag = True
                if len(word) >= 2 and separate_unknown_words:
                    for separate in xrange(1, len(word)):
                        if (word[:separate], ) in tm and (word[separate:], ) in tm:
                            french[i][j] = word[:separate]
                            j += 1
                            french[i].insert(j, word[separate:])
                            flag = False
                            break
                if flag:
                    tm[(word, )] = [
                        models.phrase(word, [unknown_word_logprob] * number_of_features_PT)
                    ]
            j += 1

    print >> sys.stderr, "Start decoding..."
    for n, f in enumerate(french):
        if verbose:
            print >> sys.stderr, "Input: " + ' '.join(f)
        # Generate cache for phrase segmentations.
        f_cache = generate_phrase_cache(f, tm)
        # Pre-calculate future cost table
        future_cost_table = precalcuate_future_cost(f, tm, weights[:number_of_features_PT])

        # score = dot(features, weights)
        # features = sums of each log feature
        # predecessor = previous hypothesis
        # lm_state = N-gram state (the last one or two words)
        # last_frange = (i, j) the range of last translated phrase in f
        # phrase = the last TM phrase object (correspondence to f[last_frange])
        # coverage = bit string representing the translation coverage on f
        # future_cost = a safe estimation to be added to total_score
        hypothesis = namedtuple(
            "hypothesis",
            "score, features, lm_state, predecessor, last_frange, phrase, coverage, future_cost")
        initial_hypothesis = hypothesis(0.0, [0.0] * number_of_features,
                                        lm.begin(), None, (0, 0), None, 0, 0)

        # stacks[# of covered words in f] (from 0 to |f|)
        stacks = [{} for _ in xrange(len(f) + 1)]
        # stacks[size][(lm_state, last_frange[1], coverage)]:
        # recombination based on (lm_state, last_frange[1], coverage).
        # For different hypotheses with the same tuple, keep the one with the higher score.
        # lm_state affects LM; last_frange affects distortion; coverage affects available choices.
        stacks[0][(lm.begin(), None, 0)] = initial_hypothesis

        for i, stack in enumerate(stacks[:-1]):
            if verbose:
                print >> sys.stderr, "Stack[%d]:" % i
            # Top-k pruning
            s_hypotheses = sorted(stack.values(),
                                  key=lambda h: h.score + h.future_cost,
                                  reverse=True)
            for h in s_hypotheses[:stack_size]:
                if verbose:
                    print >> sys.stderr, h.score, h.lm_state, bin(h.coverage), \
                        ' '.join(f[h.last_frange[0]:h.last_frange[1]]), h.future_cost
                for (f_range, delta_coverage, tm_phrases) in enumerate_phrases(f_cache, h.coverage):
                    # f_range = (i, j) of the enumerated next phrase to be translated
                    # delta_coverage = coverage of f_range
                    # tm_phrases = TM entries corresponding to fphrase f[f_range]
                    length = i + f_range[1] - f_range[0]
                    coverage = h.coverage | delta_coverage
                    distance = abs(f_range[0] - h.last_frange[1])
                    # if distance > max_distance and i < len(stacks) / 2:
                    #     continue
                    # TM might give us multiple candidates for a fphrase.
                    for phrase in tm_phrases:
                        features = h.features[:]  # copy!
                        # Features from phrase table
                        for fid in range(number_of_features_PT):
                            features[fid] += phrase.features[fid]
                        # log_lmprob (N-gram)
                        lm_state = h.lm_state
                        loglm = 0.0
                        for word in phrase.english.split():
                            (lm_state, word_logprob) = lm.score(lm_state, word)
                            loglm += word_logprob
                        # Don't forget the STOP N-gram if we just covered the whole sentence.
                        loglm += lm.end(lm_state) if length == len(f) else 0.0
                        features[4] += loglm
                        # log distortion (distance ** alpha)
                        features[5] += log(alpha) * distance
                        # length of the translation (-length)
                        features[6] += -len(phrase.english.split())
                        score = calculate_total_score(features, weights)
                        future_list = get_future_list(coverage, len(f))
                        future_cost = get_future_cost(future_list, future_cost_table)
                        new_state = (lm_state, f_range[1], coverage)
                        new_hypothesis = hypothesis(score, features, lm_state, h,
                                                    f_range, phrase, coverage, future_cost)
                        # Recombination
                        if new_state not in stacks[length] or \
                                score + future_cost > stacks[length][new_state].score + stacks[length][new_state].future_cost:
                            stacks[length][new_state] = new_hypothesis

        winners = sorted(stacks[len(f)].values(), key=lambda h: h.score, reverse=True)
        if nbest == 1:
            yield extract_english(winners[0])
        else:
            for s in winners[:nbest]:
                yield ("%d ||| %s |||" + " %f" * number_of_features) % \
                    ((n, extract_english(s)) + tuple(s.features))
    print >> sys.stderr, "Decoding completed"
def get_candidates(input, tm, lm, weights, s=1):
    alpha = 0.95  # reordering parameter
    french = [list(line.strip().split()) for line in open(input).readlines()]
    for li, line in enumerate(french):
        for wi, word in enumerate(line):
            french[li][wi] = simplify(word.decode('utf-8')).encode('utf-8')

    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french, [])):
        if (word, ) not in tm:
            tm[(word, )] = [models.phrase(word, [0.0, 0.0, 0.0, 0.0])]

    def generate_phrase_cache(f):
        cache = []
        for i in range(0, len(f)):
            entries = []
            bitstring = 0
            for j in range(i + 1, len(f) + 1):
                bitstring += 1 << (len(f) - j)
                if tuple(f[i:j]) in tm:
                    entries.append({
                        'end': j,
                        'bitstring': bitstring,
                        'phrase': tm[tuple(f[i:j])]
                    })
            cache.append(entries)
        return cache

    def enumerate_phrases(f_cache, coverage):
        for i in range(0, len(f_cache)):
            bitstring = 0
            for entry in f_cache[i]:
                if (entry['bitstring'] & coverage) == 0:
                    yield ((i, entry['end']), entry['bitstring'], entry['phrase'])

    def precalcuate_future_cost(f):
        phraseCheapestTable = {}
        futureCostTable = {}
        for i in range(0, len(f)):
            for j in range(i + 1, len(f) + 1):
                if f[i:j] in tm:
                    phraseCheapestTable[i, j] = -sys.maxint
                    for phrase in tm[f[i:j]]:
                        if phrase.logprob > phraseCheapestTable[i, j]:
                            phraseCheapestTable[i, j] = phrase.logprob
        for i in range(0, len(f)):
            futureCostTable[i, 1] = phraseCheapestTable[i, i + 1]
            for j in range(2, len(f) + 1 - i):
                if (i, i + j) in phraseCheapestTable:
                    futureCostTable[i, j] = phraseCheapestTable[i, i + j]
                else:
                    futureCostTable[i, j] = -sys.maxint
                for k in range(1, j):
                    if (((i + k, i + j) in phraseCheapestTable) and
                            (futureCostTable[i, j] < futureCostTable[i, k] + phraseCheapestTable[i + k, i + j])):
                        futureCostTable[i, j] = futureCostTable[i, k] + phraseCheapestTable[i + k, i + j]
        return futureCostTable

    def get_future_list(bitstring):
        bitList = bin(bitstring)[2:]
        futureList = []
        count = 0
        index = 0
        findZeroBit = False
        for i in range(len(bitList)):
            if bitList[i] == '0':
                if not findZeroBit:
                    index = i
                    findZeroBit = True
                count = count + 1
            else:
                if findZeroBit:
                    futureList.append((index, count))
                findZeroBit = False
                count = 0
        if findZeroBit:
            futureList.append((index, count))
        return futureList

    def get_future_cost(bitList, futureCostTable):
        cost = 0
        for item in bitList:
            cost = cost + futureCostTable[item]
        return cost

    def extract_english(h):
        return "" if h.predecessor is None else "%s%s " % (
            extract_english(h.predecessor), h.phrase.english)

    results = []
    sys.stderr.write("Decoding %s...\n" % (input, ))
    for n, f in enumerate(french):
        # Generate cache for phrase segmentations.
        f_cache = generate_phrase_cache(f)
        # Pre-calculate future cost table
        # future_cost_table = precalcuate_future_cost(f)

        # logprob = log_lmprob + log_tmprob + distortion_penalty
        # predecessor = previous hypothesis
        # lm_state = N-gram state (the last one or two words)
        # last_frange = (i, j) the range of last translated phrase in f
        # phrase = the last TM phrase object (correspondence to f[last_frange])
        # coverage = bit string representing the translation coverage on f
        # future_cost
        hypothesis = namedtuple(
            "hypothesis",
            "logprob, features, lm_score, lm_state, predecessor, last_frange, phrase, coverage")
        initial_hypothesis = hypothesis(0.0, [0.0, 0.0, 0.0, 0.0], 0.0,
                                        lm.begin(), None, (0, 0), None, 0)

        # stacks[# of covered words in f] (from 0 to |f|)
        stacks = [{} for _ in range(len(f) + 1)]
        # stacks[size][(lm_state, last_frange, coverage)]:
        # recombination based on (lm_state, last_frange, coverage).
        # For different hypotheses with the same tuple, keep the one with the higher logprob.
        # lm_state affects LM; last_frange affects distortion; coverage affects available choices.
        stacks[0][(lm.begin(), None, 0)] = initial_hypothesis

        for i, stack in enumerate(stacks[:-1]):
            # Top-k pruning
            for h in sorted(stack.itervalues(), key=lambda h: -h.logprob)[:s]:
                for (f_range, delta_coverage, tm_phrases) in enumerate_phrases(f_cache, h.coverage):
                    # f_range = (i, j) of the enumerated next phrase to be translated
                    # delta_coverage = coverage of f_range
                    # tm_phrases = TM entries corresponding to fphrase f[f_range]
                    length = i + f_range[1] - f_range[0]
                    coverage = h.coverage | delta_coverage
                    distance = f_range[0] - h.last_frange[1]
                    # TM might give us multiple candidates for a fphrase.
                    for phrase in tm_phrases:
                        # log_tmprob and distortion
                        features = map(add, h.features, phrase.features)
                        # log_lmprob (N-gram)
                        lm_state = h.lm_state
                        lm_score = h.lm_score
                        for word in phrase.english.split():
                            (lm_state, word_logprob) = lm.score(lm_state, word)
                            lm_score += word_logprob
                        # Don't forget the STOP N-gram if we just covered the whole sentence.
                        lm_score += lm.end(lm_state) if length == len(f) else 0.0
                        # Future cost.
                        # future_list = get_future_list(delta_coverage)
                        # future_cost = get_future_cost(future_list, future_cost_table)
                        logprob = sum(p * q for p, q in zip((features + [lm_score]), weights))
                        new_state = (lm_state, f_range, coverage)
                        new_hypothesis = hypothesis(logprob, features, lm_score, lm_state,
                                                    h, f_range, phrase, coverage)
                        if new_state not in stacks[length] or \
                                logprob > stacks[length][new_state].logprob:  # recombination
                            stacks[length][new_state] = new_hypothesis

        winner = sorted(stacks[len(f)].itervalues(),
                        key=lambda h: h.logprob, reverse=True)[0:100]
        for i in range(len(winner)):
            results += [
                "%d ||| %s ||| %f %f %f %f %f" %
                (n, extract_english(winner[i]), winner[i].features[0],
                 winner[i].features[1], winner[i].features[2],
                 winner[i].features[3], winner[i].lm_score)
            ]
    return results
from mafan import encoding
from mafan import text
from mafan import simplify, tradify
from mafan import split_text
import mafan
from mafan import pinyin

# Convert text in other encodings to UTF-8
# filename = 'test.txt'  # name or path of file as string
# encoding.convert(filename)  # creates a file with name 'ugly_big5_utf-8.txt' in glorious utf-8 encoding

# Convert between simplified and traditional characters
print('-' * 50)
string = u'这是麻烦啦'
print(tradify(string))  # convert string to traditional
print(simplify(tradify(string)))  # convert back to simplified

# Check whether the text contains punctuation or Latin characters
print('-' * 50)
flag = text.has_punctuation(u'这是麻烦啦')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦.')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'You are麻烦啦。')
print(flag)

# Determine whether the text is simplified or traditional
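# The tour above breaks off at the simplified-vs-traditional check. A minimal sketch of that
# last step, assuming mafan's is_simplified helper mirrors the is_traditional helper used in
# StdNm() earlier in this collection; the expected outputs are assumptions, not captured runs.
print('-' * 50)
print(text.is_simplified(u'这是麻烦啦'))            # expected True for simplified-only text
print(text.is_traditional(tradify(u'这是麻烦啦')))  # expected True after converting to traditional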
def join_elements(candidate):
    seen = set()
    for name in candidate:
        name = simplify(''.join(name.split()))
        seen.add(name)
    return seen
def simplify():
    text = request.args.get('text')
    d = {'text': mafan.simplify(text)}
    return jsonify(**d)
line = fp.readline()
while line:
    content = json.loads(line)
    text_a = content.get("question")
    text_b = content.get("answer")
    label = content.get("yesno_answer")
    text_a = text_a.replace(' ', '')
    text_b = text_b.replace(' ', '')
    text_a = text_a.replace('\t', '')
    text_b = text_b.replace('\t', '')
    text_a = text_a.replace('?', '?')
    text_b = text_b.replace('?', '?')
    if text_a[-1] != "?":
        text_a = text_a + "?"
    text_a = mafan.simplify(text_a)
    text_b = mafan.simplify(text_b)
    if label == "" and filename != "test":
        print("missing label")
        label = "Yes"
    '''if filename != "test":
        out_line = text_a + '\t' + text_b + '\t' + label
    else:
        out_line = text_a + '\t' + text_b'''
    out_line = [text_a, text_b]
    out_arr.append(out_line)
    length_arr.append(len(text_a + text_b))
    line = fp.readline()

print(filename + " average length is: %d" % (sum(length_arr) / len(length_arr)))
print(filename + " max length is: %d" % max(length_arr))
def convert():
    return simplify(request.args.get("word"))
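# simplify() and convert() above read like Flask view functions, but their route decorators are
# not part of the excerpts. A hypothetical wiring, for illustration only; the paths and port are
# assumptions, not taken from the source:
#
#   @app.route('/simplify')  ->  simplify()  returns JSON {"text": "..."}
#   @app.route('/convert')   ->  convert()   returns the simplified word as plain text
#
# Example requests once such an app is running:
#   curl 'http://localhost:5000/convert?word=麻煩'         # -> 麻烦
#   curl 'http://localhost:5000/simplify?text=這是麻煩啦'  # -> {"text": "这是麻烦啦"}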