# Imports assumed by this snippet
from mafan import simplify
from nltk import Tree
from nltk.parse.corenlp import CoreNLPParser


def 提華語句法樹(bunji="我 喜歡 豬", url='http://localhost:9000'):
    try:
        句法分析器 = CoreNLPParser(url=url)
    except Warning as 錯誤:
        print('Warning=', 錯誤)

    分析結果指標 = 句法分析器.parse(simplify(bunji).split())
    該句結果字串 = next(分析結果指標)

    # Print the parse as a bracketed string
    # (ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))
    print('該句結果字串=', 該句結果字串)

    # Draw the tree diagram from the parse
    # ROOT
    #      |
    #      IP
    #   ___|____
    #  |        VP
    #  |    ____|___
    #  NP  |        NP
    #  |   |        |
    #  PN  VV       NN
    #  |   |        |
    #  我   喜欢       猪
    該句結果字串.pretty_print()

    ##### Recover the original tokens from a tree string
    a = Tree.fromstring("(ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))")
    # ['我', '喜欢', '猪']
    print(a.leaves())
    # (ROOT 我 喜欢 猪)
    print(a.flatten())

    return 該句結果字串
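A minimal usage sketch for the function above; it assumes a Stanford CoreNLP server with the Chinese models is already listening at http://localhost:9000.

tree = 提華語句法樹("我 喜歡 豬")
print(tree.leaves())  # ['我', '喜欢', '猪'] once the server has parsed the sentence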
def Load(filename, stopwords, output_filename):
    global MIN_PERCENT
    global MIN_SUP

    candidate = set()
    for line in codecs.open(filename, 'r', 'utf-8'):
        if line[0].isdigit():
            continue
        tokens = line.strip().split('\t')
        valid = False
        for token in tokens[2:]:
            support = int(token.split(':')[-2])
            percentage = float(token.split(':')[-1][:-1])
            if (percentage >= MIN_PERCENT) or (support >= MIN_SUP):
                name = ':'.join(token.split(':')[:-2])
                valid = True
                if NoSeparator(name) and StopWordChecking(name, stopwords):
                    candidate.add(name.lower())
        if valid:
            name = tokens[0]
            if NoSeparator(name) and StopWordChecking(name, stopwords):
                candidate.add(name.lower())
    out = codecs.open(output_filename, 'w', 'utf-8')
    if LANGUAGE == 'zh':
        seen = set()
        for name in candidate:
            name = simplify(''.join(name.split()))
            seen.add(name)
        candidate = seen
    print len(candidate)
    for name in candidate:
        out.write(name + '\n')
    out.close()
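A small sketch of the line format that Load() above implies: tab-separated columns where the trailing entries look like "name:support:percentage%". The sample line below is invented for illustration; the real input format is not shown in this example.

sample = u"machine learning\t7\tdeep learning:17:35.0%\tml:9:12.5%"
tokens = sample.strip().split('\t')
for token in tokens[2:]:
    support = int(token.split(':')[-2])            # 17, then 9
    percentage = float(token.split(':')[-1][:-1])  # 35.0, then 12.5
    name = ':'.join(token.split(':')[:-2])         # 'deep learning', then 'ml'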
Example #3
def seg(sentence):
    paraphrase = tokenizer.encode_plus(simplify(sentence), return_tensors="pt")
    paraphrase['attention_mask'][-1] = 0
    # pdb.set_trace()
    for key in paraphrase.keys():
        paraphrase[key] = paraphrase[key].to(device)

    paraphrase_classification_logits = model(**paraphrase)[0]
    paraphrase_results = paraphrase_classification_logits.argmax(axis=-1)[0]
    paraphrase_results = paraphrase_results[1:-1]
    # pdb.set_trace()

    res = list()
    length = 0
    word = False
    for i in range(len(paraphrase_results)):
        if paraphrase_results[i] == 3:
            res.append(sentence[i])
            length = 0
        if paraphrase_results[i] == 0:
            length += 1
        if paraphrase_results[i] == 1:
            length += 1
        if paraphrase_results[i] == 2:
            res.append(sentence[i-length:i+1])
            length = 0
    #print(''.join(res) == sentence)
    print(' '.join(res), '\n')
    return ','.join(res) + '\n'
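The label-to-span loop above can be read on its own. Below is a self-contained sketch with hand-written labels; the assumption (not stated in the original) is that tags 0/1/2/3 mean begin/middle/end/single-character word.

def decode(sentence, labels):
    # Rebuild words from per-character tags (assumed meaning of 0/1/2/3 above).
    res, length = [], 0
    for i, tag in enumerate(labels):
        if tag == 3:                 # single-character word
            res.append(sentence[i])
            length = 0
        elif tag in (0, 1):          # begin / middle of a multi-character word
            length += 1
        elif tag == 2:               # end of a multi-character word
            res.append(sentence[i - length:i + 1])
            length = 0
    return res

print(decode(u"我们爱猪", [0, 2, 3, 3]))  # ['我们', '爱', '猪']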
Example #4
def Load(filename, stopwords, output_filename):
    global MIN_PERCENT
    global MIN_SUP

    candidate = set()
    for line in codecs.open(filename, 'r', 'utf-8'):
        if line[0].isdigit():
            continue
        tokens = line.strip().split('\t')
        valid = False
        for token in tokens[3:]:
            support = int(token.split(':')[-2])
            percentage = float(token.split(':')[-1][:-1])
            if (percentage >= MIN_PERCENT) or (support >= MIN_SUP):
                name = ':'.join(token.split(':')[:-2])
                valid = True
                if NoSeparator(name) and StopWordChecking(name, stopwords):
                    candidate.add(name.lower())
        if valid:
            name = tokens[0]
            if NoSeparator(name) and StopWordChecking(name, stopwords):
                candidate.add(name.lower())
    out = codecs.open(output_filename, 'w', 'utf-8')
    if LANGUAGE == 'zh':
        seen = set()
        for name in candidate:
            name = simplify(''.join(name.split()))
            seen.add(name)
        candidate = seen
    print len(candidate)
    for name in candidate:
        out.write(name + '\n')
    out.close()
Example #5
def preprocess_text(text, truncate_at=100):
    '''
    Remove content outside the allowed character set and truncate the text to
    its first `truncate_at` characters.
    '''
    truncated = text[:truncate_at]
    cleaned = REGEX_TO_REMOVE.sub(r'', truncated)

    return mafan.simplify(cleaned)
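A usage sketch for preprocess_text(); REGEX_TO_REMOVE is defined elsewhere in the source project, so a hypothetical pattern that strips everything outside CJK characters and ASCII letters/digits stands in for it here.

import re
import mafan

REGEX_TO_REMOVE = re.compile(u'[^\u4e00-\u9fffA-Za-z0-9]')  # hypothetical stand-in
print(preprocess_text(u'這是麻煩啦。Hello!'))  # -> 这是麻烦啦Hello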
Example #6
def can_st(page):
    simplified = simplify(page) == page.decode("utf8")
    traditional = tradify(page) == page.decode("utf8")
    # only simplified
    if simplified and not traditional and not config.zh_s:
        return False
    # only traditional
    elif traditional and not simplified and not config.zh_t:
        return False
    else:
        return config.zh_t or config.zh_s
Example #8
def Load(filename, output_filename):
    candidate = set()
    for line in codecs.open(filename, 'r', 'utf-8'):
        tokens = line.strip().split('\t')
        for token in tokens[3:]:
            name = ':'.join(token.split(':')[:-2])

            if LANGUAGE == 'zh':
                name = simplify(''.join(name.split()))
            candidate.add(name.lower())

        name = tokens[0]
        #name = simplify(''.join(name.split('')))
        if LANGUAGE == 'zh':
            name = simplify(''.join(name.split()))
        candidate.add(name.lower())
    print len(candidate)

    out = codecs.open(output_filename, 'w', 'utf-8')
    for name in candidate:
        out.write(name + '\n')
    out.close()
Example #9
def TM(filename, k, weights, simpmode=False):
    sys.stderr.write("Reading translation model from %s...\n" % (filename, ))
    tm = {}
    for line in open(filename).readlines():
        (f, e, features) = line.strip().split(" ||| ")
        tm.setdefault(tuple(f.split()), []).append(
            phrase(e, [float(i) for i in features.strip().split()]))

    tmptm = {}
    for f in tm:  # prune all but top k translations
        tm[f].sort(key=lambda x: sum(p * q
                                     for p, q in zip(x.features, weights)),
                   reverse=True)
        del tm[f][k:]
        if simpmode:
            from mafan import simplify
            sf = tuple(simplify(f[i].decode('utf-8')) for i in range(len(f)))
            if sf != tuple(f[i].decode('utf-8') for i in range(len(f))):
                if sf in tm:
                    for p in tm[f]:
                        found = False
                        for pi, sp in enumerate(tm[sf]):
                            if sp.english == p.english:
                                found = True
                                tm[sf][pi].features[0] = (
                                    tm[sf][pi].features[0] + p.features[0]) / 2
                                tm[sf][pi].features[1] = math.log(
                                    math.exp(tm[sf][pi].features[1]) +
                                    math.exp(p.features[1]))
                                tm[sf][pi].features[2] = max(
                                    [tm[sf][pi].features[2], p.features[2]])
                                tm[sf][pi].features[3] = max(
                                    [tm[sf][pi].features[3], p.features[3]])
                        if not found:
                            tm[sf].append(
                                phrase(p.english, [
                                    p.features[0] / 2,
                                    math.log(math.exp(p.features[1]) + 1),
                                    p.features[2], p.features[3]
                                ]))
                else:
                    for p in tm[f]:
                        tmptm.setdefault(sf, []).append(p)

    for f in tmptm:
        tm[f] = tmptm[f]

    return tm
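For reference, a sketch of one line in the "f ||| e ||| features" phrase-table layout that TM() parses; the Chinese/English pair and the feature values are invented.

line = "我 喜欢 ||| i like ||| -0.51 -1.20 -0.36 -0.84"
f, e, features = line.strip().split(" ||| ")
key = tuple(f.split())                          # ('我', '喜欢')
values = [float(x) for x in features.split()]   # the four phrase-table features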
Example #10
def StdNm(nonstd=None):

    ## make a DataFrame indexed by standardized name,
    ## containing the columns "FirstName", "OtherNames", "OtherNames1"
    std = pd.read_csv("csv/StandardNames.csv")
    std["FullName"] = std["LastName"] + std["FirstName"]
    std["FullName"].fillna(method="ffill", inplace=True)
    std.set_index("FullName", inplace=True)

    #### LastName sometimes appears in simplified characters
    std["ConvLast"] = [
        sTR if pd.isnull(sTR) else
        mf.simplify(sTR) if mf.is_traditional(sTR) else mf.tradify(sTR)
        for sTR in std["LastName"]
    ]
    std["LastOth"] = std["LastName"].fillna(method="ffill") + std['OtherNames']
    std["ConvFst"] = std["ConvLast"].fillna(method="ffill") + std['FirstName']
    std["ConvOth"] = std["ConvLast"].fillna(method="ffill") + std['OtherNames']
    std.drop(["Details", "Studio", "LastName", "ConvLast"],
             axis=1,
             inplace=True)

    ## make a dataframe of {key: alternative names, value: standard names} with
    ## unique keys and overlapping values
    map_df = pd.DataFrame()

    for colName in list(std.columns):
        df = pd.DataFrame({"key": std[colName], "value": std.index})
        map_df = map_df.append(df, ignore_index=True)

    map_df.dropna(inplace=True)

    ## Standardize names in the given Series
    map_dict = map_df.set_index('key').to_dict()['value']

    def standardize_names(participant):
        if participant in map_dict:
            return map_dict[participant]
        else:
            return participant

    ans = nonstd.map(standardize_names)

    return ans
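A usage sketch for StdNm(); it assumes csv/StandardNames.csv exists with the columns referenced above, and the participant names below are invented.

import pandas as pd

participants = pd.Series([u'張三', u'张三', u'李四'])  # hypothetical name variants
standardized = StdNm(participants)                     # each variant mapped to its standard form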
Example #11
def extract_glove_embeddings():
    log("extract_glove_embeddings()...")

    _, word_to_index = read_list_file(word_file)
    word_list = []
    embedding_list = []
    with codecs.open(glove_file, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip().split()
            word = mafan.simplify(line[0])
            if word not in word_to_index: continue
            embedding = np.array([float(i) for i in line[1:]])
            word_list.append(word)
            embedding_list.append(embedding)

    np.save(pretrained_word_file, word_list)
    np.save(pretrained_embedding_file, embedding_list)

    log(" %d pre-trained words\n" % len(word_list))
    return
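The GloVe file read above is plain text with one token per line followed by its vector components; a sketch of parsing a single invented line.

import mafan
import numpy as np

line = u"猫 0.12 -0.35 0.07".strip().split()
word = mafan.simplify(line[0])
embedding = np.array([float(i) for i in line[1:]])  # array([ 0.12, -0.35,  0.07])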
Example #13
def process_language(ele):
    try:
        text_list = ele.text.split('\n')
        text_list = [replace_special(x) for x in text_list]
        languages = [guess_language(l) for l in text_list]
        en_index = [l == 'en' for l in languages]
        zh_index = [l == 'zh' for l in languages]
        en_lines = list(compress(text_list, en_index))
        zh_lines = list(compress(text_list, zh_index))
        if len(en_lines) == 0 or len(zh_lines) == 0:
            return None
        en_text = ' '.join(en_lines)
        if len(check_chinese(en_text)) > 0:
            return None
        zh_text = simplify(' '.join(zh_lines))
        if len(check_english(zh_text)) > 0:
            return None
        text_line = en_text + ' | ' + zh_text
        return text_line
    except:
        return None
Example #14
File: subtitles.py  Project: lueo/angelsub
    def dup_check(self, subtitles):
        ''' Check if there are duplicated subtitles.

        :returns: subtitles without duplication.
        '''
        # convert all encoding to utf-8, trad -> simp
        for sub in subtitles:
            enc = chardet.detect(sub['content'])['encoding']
            # for simp. chinese
            if enc == 'GB2312':
                sub['content'] = sub['content'].decode('gb18030')
            else:
                sub['content'] = sub['content'].decode(enc)
            # for big5, convert!
            if enc == 'Big5':
                sub['content'] = mafan.simplify(sub['content'])

        # find dups
        dup_tags = [False] * len(subtitles)
        for i in range(len(subtitles)):
            for j in range(i+1, len(subtitles)):
                sub_a = subtitles[i]['content']
                sub_b = subtitles[j]['content']
                sim = difflib.SequenceMatcher(None, sub_a, sub_b).real_quick_ratio()
                logging.debug('Similar ratio between %d and %d is %f' % (i, j, sim))
                if sim >= 0.9:
                    if len(sub_a) >= len(sub_b):
                        dup_tags[j] = True
                    else:
                        dup_tags[i] = True

        new_subs = [s for i, s in enumerate(subtitles) if dup_tags[i] == False]

        logging.debug('Total %d subtitles remains.' % len(new_subs)) 

        return new_subs
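One note on the similarity test above: real_quick_ratio() only returns an upper bound on SequenceMatcher.ratio() (computed from the two lengths), so the 0.9 threshold is a cheap screen rather than an exact similarity. A quick sketch:

import difflib

a = u'你好,世界。这是字幕。'
b = u'你好,世界!这是字幕!'
m = difflib.SequenceMatcher(None, a, b)
print(m.real_quick_ratio())  # fast upper bound on ratio()
print(m.ratio())             # the exact, more expensive similarity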
Example #15
def get_candidates(inputfile,
                   tm,
                   lm,
                   weights,
                   stack_size=10,
                   nbest=None,
                   simpmode=True,
                   separate_unknown_words=False,
                   verbose=False):
    if nbest is None:
        nbest = stack_size

    print >> sys.stderr, "Decoding: " + inputfile
    print >> sys.stderr, "Reading input..."
    french = [line.strip().split()
              for line in open(inputfile).readlines()]  # list of list
    if simpmode:
        from mafan import simplify
        for li, line in enumerate(french):
            for wi, word in enumerate(line):
                french[li][wi] = simplify(word.decode('utf-8')).encode('utf-8')

    # tm should translate unknown words as-is with a small probability
    # (i.e. only fallback to copying unknown words over as the last resort)
    for i in xrange(len(french)):
        j = 0
        while j < len(french[i]):
            word = french[i][j]
            if (word, ) not in tm:
                flag = True
                if len(word) >= 2 and separate_unknown_words:
                    for separate in xrange(1, len(word)):
                        if (word[:separate], ) in tm and (
                                word[separate:], ) in tm:
                            french[i][j] = word[:separate]
                            j += 1
                            french[i].insert(j, word[separate:])
                            flag = False
                            break
                if flag:
                    tm[(word, )] = [
                        models.phrase(word, [unknown_word_logprob] *
                                      number_of_features_PT)
                    ]
            j += 1

    print >> sys.stderr, "Start decoding..."
    for n, f in enumerate(french):
        if verbose:
            print >> sys.stderr, "Input: " + ' '.join(f)
        # Generate cache for phrase segmentations.
        f_cache = generate_phrase_cache(f, tm)
        # Pre-calculate future cost table
        future_cost_table = precalcuate_future_cost(
            f, tm, weights[:number_of_features_PT])

        # score = dot(features, weights)
        # features = sums of each log feature
        # predecessor = previous hypothesis
        # lm_state = N-gram state (the last one or two words)
        # last_frange = (i, j) the range of last translated phrase in f
        # phrase = the last TM phrase object (correspondence to f[last_frange])
        # coverage = bit string representing the translation coverage on f
        # future_cost = a safe estimation to be added to total_score
        hypothesis = namedtuple(
            "hypothesis",
            "score, features, lm_state, predecessor, last_frange, phrase, coverage, future_cost"
        )
        initial_hypothesis = hypothesis(0.0, [0.0] * number_of_features,
                                        lm.begin(), None, (0, 0), None, 0, 0)

        # stacks[# of covered words in f] (from 0 to |f|)
        stacks = [{} for _ in xrange(len(f) + 1)]
        # stacks[size][(lm_state, last_frange[1], coverage)]:
        # recombination based on (lm_state, last_frange[1], coverage).
        # For different hypotheses with the same tuple, keep the one with the higher score.
        # lm_state affects LM; last_frange affects distortion; coverage affects available choices.
        stacks[0][(lm.begin(), None, 0)] = initial_hypothesis

        for i, stack in enumerate(stacks[:-1]):
            if verbose:
                print >> sys.stderr, "Stack[%d]:" % i

            # Top-k pruning
            s_hypotheses = sorted(stack.values(),
                                  key=lambda h: h.score + h.future_cost,
                                  reverse=True)
            for h in s_hypotheses[:stack_size]:
                if verbose:
                    print >> sys.stderr, h.score, h.lm_state, bin(
                        h.coverage), ' '.join(f[h.last_frange[0]:h.
                                                last_frange[1]]), h.future_cost

                for (f_range, delta_coverage,
                     tm_phrases) in enumerate_phrases(f_cache, h.coverage):
                    # f_range = (i, j) of the enumerated next phrase to be translated
                    # delta_coverage = coverage of f_range
                    # tm_phrases = TM entries corresponding to fphrase f[f_range]
                    length = i + f_range[1] - f_range[0]
                    coverage = h.coverage | delta_coverage
                    distance = abs(f_range[0] - h.last_frange[1])
                    # if distance > max_distance and i < len(stacks) / 2:
                    #   continue

                    # TM might give us multiple candidates for a fphrase.
                    for phrase in tm_phrases:
                        features = h.features[:]  # copy!
                        # Features from phrase table
                        for fid in range(number_of_features_PT):
                            features[fid] += phrase.features[fid]
                        # log_lmprob (N-gram)
                        lm_state = h.lm_state
                        loglm = 0.0
                        for word in phrase.english.split():
                            (lm_state, word_logprob) = lm.score(lm_state, word)
                            loglm += word_logprob
                        # Don't forget the STOP N-gram if we just covered the whole sentence.
                        loglm += lm.end(lm_state) if length == len(f) else 0.0
                        features[4] += loglm
                        # log distortion (distance ** alpha)
                        features[5] += log(alpha) * distance
                        # length of the translation (-length)
                        features[6] += -len(phrase.english.split())

                        score = calculate_total_score(features, weights)
                        future_list = get_future_list(coverage, len(f))
                        future_cost = get_future_cost(future_list,
                                                      future_cost_table)

                        new_state = (lm_state, f_range[1], coverage)
                        new_hypothesis = hypothesis(score, features, lm_state,
                                                    h, f_range, phrase,
                                                    coverage, future_cost)
                        # Recombination
                        if new_state not in stacks[length] or \
                            score + future_cost > stacks[length][new_state].score + stacks[length][new_state].future_cost:
                            stacks[length][new_state] = new_hypothesis

        winners = sorted(stacks[len(f)].values(),
                         key=lambda h: h.score,
                         reverse=True)
        if nbest == 1:
            yield extract_english(winners[0])
        else:
            for s in winners[:nbest]:
                yield ("%d ||| %s |||" + " %f" * number_of_features) % \
                  ((n, extract_english(s)) + tuple(s.features))
    print >> sys.stderr, "Decoding completed"
Example #16
def get_candidates(input, tm, lm, weights, s=1):

    alpha = 0.95  #reordering parameter
    french = [list(line.strip().split()) for line in open(input).readlines()]
    for li, line in enumerate(french):
        for wi, word in enumerate(line):
            french[li][wi] = simplify(word.decode('utf-8')).encode('utf-8')

    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french, [])):
        if (word, ) not in tm:
            tm[(word, )] = [models.phrase(word, [0.0, 0.0, 0.0, 0.0])]

    def generate_phrase_cache(f):
        cache = []
        for i in range(0, len(f)):
            entries = []
            bitstring = 0
            for j in range(i + 1, len(f) + 1):
                bitstring += 1 << (len(f) - j)
                if tuple(f[i:j]) in tm:
                    entries.append({
                        'end': j,
                        'bitstring': bitstring,
                        'phrase': tm[tuple(f[i:j])]
                    })
            cache.append(entries)
        return cache

    def enumerate_phrases(f_cache, coverage):
        for i in range(0, len(f_cache)):
            bitstring = 0
            for entry in f_cache[i]:
                if (entry['bitstring'] & coverage) == 0:
                    yield ((i, entry['end']), entry['bitstring'],
                           entry['phrase'])

    def precalcuate_future_cost(f):
        phraseCheapestTable = {}
        futureCostTable = {}
        for i in range(0, len(f)):
            for j in range(i + 1, len(f) + 1):
                if tuple(f[i:j]) in tm:
                    phraseCheapestTable[i, j] = -sys.maxint
                    for phrase in tm[tuple(f[i:j])]:
                        if phrase.logprob > phraseCheapestTable[i, j]:
                            phraseCheapestTable[i, j] = phrase.logprob
        for i in range(0, len(f)):
            futureCostTable[i, 1] = phraseCheapestTable[i, i + 1]
            for j in range(2, len(f) + 1 - i):
                if (i, i + j) in phraseCheapestTable:
                    futureCostTable[i, j] = phraseCheapestTable[i, i + j]
                else:
                    futureCostTable[i, j] = -sys.maxint
                for k in range(1, j):
                    if (((i + k, i + j) in phraseCheapestTable) and
                        (futureCostTable[i, j] < futureCostTable[i, k] +
                         phraseCheapestTable[i + k, i + j])):
                        futureCostTable[i, j] = futureCostTable[
                            i, k] + phraseCheapestTable[i + k, i + j]
        return futureCostTable

    def get_future_list(bitstring):
        bitList = bin(bitstring)[2:]
        futureList = []
        count = 0
        index = 0
        findZeroBit = False
        for i in range(len(bitList)):
            if bitList[i] == '0':
                if not findZeroBit:
                    index = i
                findZeroBit = True
                count = count + 1
            else:
                if findZeroBit:
                    futureList.append((index, count))
                findZeroBit = False
                count = 0
        if findZeroBit:
            futureList.append((index, count))
        return futureList

    def get_future_cost(bitList, futureCostTable):
        cost = 0
        for item in bitList:
            cost = cost + futureCostTable[item]
        return cost

    def extract_english(h):
        return "" if h.predecessor is None else "%s%s " % (extract_english(
            h.predecessor), h.phrase.english)

    results = []
    sys.stderr.write("Decoding %s...\n" % (input, ))
    for n, f in enumerate(french):
        # Generate cache for phrase segmentations.
        f_cache = generate_phrase_cache(f)
        # Pre-calculate future cost table
        #future_cost_table = precalcuate_future_cost(f)

        # logprob = log_lmprob + log_tmprob + distortion_penalty
        # predecessor = previous hypothesis
        # lm_state = N-gram state (the last one or two words)
        # last_frange = (i, j) the range of last translated phrase in f
        # phrase = the last TM phrase object (correspondence to f[last_frange])
        # coverage = bit string representing the translation coverage on f
        # future_cost
        hypothesis = namedtuple(
            "hypothesis",
            "logprob, features, lm_score, lm_state, predecessor, last_frange, phrase, coverage"
        )
        initial_hypothesis = hypothesis(0.0, [0.0, 0.0, 0.0, 0.0], 0.0,
                                        lm.begin(), None, (0, 0), None, 0)
        # stacks[# of covered words in f] (from 0 to |f|)
        stacks = [{} for _ in range(len(f) + 1)]
        # stacks[size][(lm_state, last_frange, coverage)]:
        # recombination based on (lm_state, last_frange, coverage).
        # For different hypotheses with the same tuple, keep the one with the higher logprob.
        # lm_state affects LM; last_frange affects distortion; coverage affects available choices.
        stacks[0][(lm.begin(), None, 0)] = initial_hypothesis
        for i, stack in enumerate(stacks[:-1]):

            # Top-k pruning
            for h in sorted(stack.itervalues(), key=lambda h: -h.logprob)[:s]:
                for (f_range, delta_coverage,
                     tm_phrases) in enumerate_phrases(f_cache, h.coverage):
                    # f_range = (i, j) of the enumerated next phrase to be translated
                    # delta_coverage = coverage of f_range
                    # tm_phrases = TM entries corresponding to fphrase f[f_range]
                    length = i + f_range[1] - f_range[0]
                    coverage = h.coverage | delta_coverage
                    distance = f_range[0] - h.last_frange[1]

                    # TM might give us multiple candidates for a fphrase.
                    for phrase in tm_phrases:
                        # log_tmprob and distortion
                        features = map(add, h.features, phrase.features)
                        # log_lmprob (N-gram)
                        lm_state = h.lm_state
                        lm_score = h.lm_score
                        for word in phrase.english.split():
                            (lm_state, word_logprob) = lm.score(lm_state, word)
                            lm_score += word_logprob
                        # Don't forget the STOP N-gram if we just covered the whole sentence.
                        lm_score += lm.end(lm_state) if length == len(
                            f) else 0.0

                        # Future cost.
                        #future_list = get_future_list(delta_coverage)
                        #future_cost = get_future_cost(future_list, future_cost_table)

                        logprob = sum(
                            p * q
                            for p, q in zip((features + [lm_score]), weights))
                        new_state = (lm_state, f_range, coverage)
                        new_hypothesis = hypothesis(logprob, features,
                                                    lm_score, lm_state, h,
                                                    f_range, phrase, coverage)
                        if new_state not in stacks[length] or \
                            logprob > stacks[length][new_state].logprob:  # recombination
                            stacks[length][new_state] = new_hypothesis

        winner = sorted(stacks[len(f)].itervalues(),
                        key=lambda h: h.logprob,
                        reverse=True)[0:100]
        for i in range(len(winner)):
            results += [
                "%d ||| %s ||| %f %f %f %f %f" %
                (n, extract_english(winner[i]), winner[i].features[0],
                 winner[i].features[1], winner[i].features[2],
                 winner[i].features[3], winner[i].lm_score)
            ]

    return results
Example #17
from mafan import encoding
from mafan import text
from mafan import simplify, tradify
from mafan import split_text
import mafan
from mafan import pinyin

# Convert text in other encodings to UTF-8
# filename = 'test.txt'  # name or path of file as string
# encoding.convert(filename)  # creates a file with name 'ugly_big5_utf-8.txt' in glorious utf-8 encoding

# Convert between simplified and traditional characters
print('-' * 50)
string = u'这是麻烦啦'
print(tradify(string))  # convert string to traditional
print(simplify(tradify(string)))  # convert back to simplified

# Check whether the text contains punctuation or Latin characters
print('-' * 50)
flag = text.has_punctuation(u'这是麻烦啦')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦.')
print(flag)
flag = text.has_punctuation(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'这是麻烦啦。')
print(flag)
flag = text.contains_latin(u'You are麻烦啦。')
print(flag)

# Determine whether the text is simplified or traditional
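# A short sketch for the check above: mafan.is_traditional also appears in
# Example #10; is_simplified is assumed to be its simplified-script counterpart.
print('-' * 50)
sample = u'这是麻烦啦'
print(mafan.is_simplified(sample))            # expected True
print(mafan.is_traditional(tradify(sample)))  # expected True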
Example #18
def join_elements(candidate):
    seen = set()
    for name in candidate:
        name = simplify(''.join(name.split()))
        seen.add(name)
    return seen
Example #19
def simplify():
    text = request.args.get('text')
    d = {'text': mafan.simplify(text)}
    return jsonify(**d)
Example #20
    line = fp.readline()
    while line:
        content = json.loads(line)

        text_a = content.get("question")
        text_b = content.get("answer")
        label = content.get("yesno_answer")
        text_a = text_a.replace(' ', '')
        text_b = text_b.replace(' ', '')
        text_a = text_a.replace('\t', '')
        text_b = text_b.replace('\t', '')
        text_a = text_a.replace('?', '?')
        text_b = text_b.replace('?', '?')
        if text_a[-1] != "?":
            text_a = text_a + "?"
        text_a = mafan.simplify(text_a)
        text_b = mafan.simplify(text_b)

        if label == "" and filename != "test":
            print("miss label")
            label = "Yes"
        '''if filename != "test":
            out_line = text_a + '\t' + text_b + '\t' + label
        else:
            out_line = text_a + '\t' + text_b'''
        out_line = [text_a, text_b]
        out_arr.append(out_line)
        length_arr.append(len(text_a + text_b))
        line = fp.readline()
    print(filename + " aver length is:%d", sum(length_arr) / len(length_arr))
    print(filename + " max length is:%d", max(length_arr))
Example #21
def convert():
    return simplify(request.args.get("word"))