Exemplo n.º 1
0
def seg_txt_search(txt):
    """Split *txt* into a list of search terms.

    Each token produced by ``seg_txt(txt)`` is decoded from UTF-8 (undecodable
    bytes are ignored) and handled as follows:

    * One-character tokens are collected into a pending run.  When the run
      ends, a run of two or more characters is joined and handed to
      ``word_len2``; a run of exactly one character is kept only if it lies
      in the range ``"一"``..``"龥"`` and is not in ``SMALLCHAR``.
    * Longer tokens of more than 16 characters are dropped.
    * Other tokens are lower-cased.  If their UTF-8 form is ASCII
      alphanumeric they are kept whole; otherwise tokens of at most two
      characters are kept as UTF-8 bytes and longer ones are expanded with
      ``word_len2``.

    Before returning, every ``str`` item is encoded to UTF-8; items of any
    other type are passed through unchanged.
    """
    terms = []
    singles = []

    def flush_singles():
        # Emit whatever the pending run of one-character tokens contributes.
        if not singles:
            return
        if len(singles) > 1:
            terms.extend(word_len2("".join(singles)))
            return
        lone = singles[0]
        if "一" <= lone <= "龥" and lone not in SMALLCHAR:
            terms.append(lone)

    for raw in seg_txt(txt):
        token = raw.decode("utf-8", "ignore")
        if len(token) == 1:
            singles.append(token)
            continue

        # A multi-character token ends the current run of single characters.
        flush_singles()
        del singles[:]

        if len(token) > 16:
            continue

        token = token.lower()
        encoded = token.encode("utf-8", "ignore")
        if encoded.isalnum():
            terms.append(token)
        elif len(token) <= 2:
            terms.append(encoded)
        else:
            terms.extend(word_len2(token))

    # The text may end while a run of single characters is still pending.
    flush_singles()

    return [
        term.encode("utf-8", "ignore") if type(term) is str else term
        for term in terms
    ]
Exemplo n.º 2
0
def seg_txt2(txt):
    """Yield the lower-cased tokens of ``seg_txt(txt)``, filtering single characters.

    A token whose lower-cased, still-undecoded form is longer than 3 is
    yielded as-is, without decoding.  Shorter tokens are decoded from UTF-8
    (undecodable bytes are ignored) and yielded in decoded form, except that
    a one-character token is yielded only when it lies in the range
    ``"一"``..``"龥"`` and is not in ``STOPWORD``.

    Note that the yielded items are therefore a mix of undecoded and decoded
    values.
    """
    # NOTE(review): presumably seg_txt yields UTF-8 byte strings, so the
    # "> 3" test skips decoding for anything longer than one CJK character
    # -- confirm against seg_txt.
    for raw in seg_txt(txt):
        lowered = raw.lower()
        if len(lowered) > 3:
            yield lowered
            continue

        text = lowered.decode("utf-8", "ignore")
        if len(text) != 1:
            yield text
        elif "一" <= text <= "龥" and text not in STOPWORD:
            yield text
Exemplo n.º 3
0
def seg_title_search(txt):
    """Split the title *txt* into a list of UTF-8 encoded search terms.

    Each token produced by ``seg_txt(txt)`` is decoded from UTF-8 (undecodable
    bytes are ignored).  One-character tokens are collected into a pending
    run.  When a longer token arrives:

    * every pending character is emitted individually and, if there were two
      or more, the joined run is also expanded with ``word_len2``;
    * the token itself is dropped if it has more than 16 characters;
    * otherwise it is lower-cased; if its UTF-8 form is ASCII alphanumeric it
      is kept whole, else each of its characters is emitted, followed by the
      UTF-8 token (at most two characters) or its ``word_len2`` expansion.

    A run still pending at the end of the text is expanded with ``word_len2``
    when it has two or more characters; a single leftover character is kept
    only if it lies in ``u"一"``..``u"龥"`` and is not in ``SMALLCHAR``.

    Every ``unicode`` item is encoded to UTF-8 before the list is returned.
    """
    terms = []
    singles = []

    for raw in seg_txt(txt):
        token = raw.decode("utf-8", "ignore")

        if len(token) == 1:
            singles.append(token)
            continue

        # A multi-character token ends the pending run: emit each buffered
        # character on its own, plus the word_len2 expansion of the run.
        terms.extend(singles)
        if len(singles) > 1:
            terms.extend(word_len2("".join(singles)))
        singles = []

        if len(token) > 16:
            continue

        token = token.lower()
        encoded = token.encode("utf-8", "ignore")
        if encoded.isalnum():
            terms.append(token)
            continue

        # Non-alphanumeric token: index every character individually too.
        terms.extend(token)
        if len(token) <= 2:
            terms.append(encoded)
        else:
            terms.extend(word_len2(token))

    # NOTE(review): unlike the in-loop flush above, a trailing run does not
    # emit its characters individually, and a lone trailing character is
    # filtered by range and SMALLCHAR -- confirm this asymmetry is intended.
    if len(singles) > 1:
        terms.extend(word_len2("".join(singles)))
    elif singles:
        last = singles[0]
        if u"一" <= last <= u"龥" and last not in SMALLCHAR:
            terms.append(last)

    return [
        term.encode("utf-8", "ignore") if type(term) is unicode else term
        for term in terms
    ]
Exemplo n.º 4
0
Arquivo: search.py Projeto: 42qu/mmseg
def seg_title_search(txt):
    """Build the list of UTF-8 encoded index terms for the title *txt*.

    Tokens come from ``seg_txt(txt)`` and are decoded from UTF-8, ignoring
    undecodable bytes.  Consecutive one-character tokens are buffered.  A
    longer token flushes the buffer (each buffered character is emitted, and
    a buffer of two or more is additionally joined and expanded through
    ``word_len2``) and is then processed itself:

    * more than 16 characters: ignored;
    * lower-cased UTF-8 form is ASCII alphanumeric: emitted whole;
    * anything else: each character is emitted, then either the UTF-8 token
      (length at most two) or its ``word_len2`` expansion.

    At the end of the input, a leftover buffer of two or more characters is
    expanded through ``word_len2``; a single leftover character is emitted
    only when it is within ``u"一"``..``u"龥"`` and absent from ``SMALLCHAR``.

    All ``unicode`` items are encoded to UTF-8 in the returned list.
    """
    collected = []
    pending = []

    for segment in seg_txt(txt):
        piece = segment.decode("utf-8", "ignore")

        if len(piece) == 1:
            pending.append(piece)
            continue

        # Flush the buffered single characters before handling this token.
        collected += pending
        if len(pending) > 1:
            collected += word_len2("".join(pending))
        pending = []

        if len(piece) <= 16:
            piece = piece.lower()
            piece_utf8 = piece.encode("utf-8", "ignore")
            if piece_utf8.isalnum():
                collected.append(piece)
            else:
                # Emit every character, then the token or its expansion.
                collected += piece
                if len(piece) > 2:
                    collected += word_len2(piece)
                else:
                    collected.append(piece_utf8)

    # NOTE(review): the end-of-input handling differs from the in-loop flush
    # (no per-character emission; a lone character is filtered) -- confirm
    # that this difference is deliberate.
    if len(pending) > 1:
        collected += word_len2("".join(pending))
    elif pending:
        only = pending[0]
        if u"一" <= only <= u"龥" and only not in SMALLCHAR:
            collected.append(only)

    encoded_terms = []
    for item in collected:
        if type(item) is unicode:
            item = item.encode("utf-8", "ignore")
        encoded_terms.append(item)
    return encoded_terms