예제 #1
0
 def is_valid_word(word):
     """Return True when *word* passes every enabled filter.

     Each check (min/max length, stop-word membership, alpha ratio) is
     applied only when its governing setting is truthy; a falsy setting
     disables that check entirely.
     """
     word = word.lower().strip()
     if len_min and len(word) < len_min:
         return False
     if len_max and len(word) > len_max:
         return False
     if stop_corpus and word in stop_corpus:
         return False
     if alpha_thres and not pu.has_enough_alpha(word, alpha_thres):
         return False
     return True
예제 #2
0
def extract_bad_tweets_into(files, output_file):
    """Collect low-quality tweets from *files* and dump them to *output_file*.

    A tweet is considered bad when its text is shorter than 20 characters
    or fails the 0.6 alpha-ratio check.

    :param files: iterable of file paths, each loadable via fu.load_array
    :param output_file: path the bad tweets are dumped to
    :return: (number of bad tweets, total number of tweets scanned)
    """
    scanned = 0
    bad_tweets = []
    for path in files:
        tweets = fu.load_array(path)
        scanned += len(tweets)
        bad_tweets.extend(
            tw for tw in tweets
            if len(tw[tk.key_text]) < 20
            or not pu.has_enough_alpha(tw[tk.key_text], 0.6)
        )
    fu.dump_array(output_file, bad_tweets)
    return len(bad_tweets), scanned
예제 #3
0
def filter_twarr_text(twarr):
    """ This function only suits for tweets that are not processed.

    Normalizes each tweet's text and drops tweets whose text is empty or
    fails the 0.65 alpha-ratio check.  Surviving tweets are mutated in
    place: tk.key_orgntext keeps the original text, tk.key_text the
    normalized text.

    :param twarr: list of tweet dicts
    :return: list of tweets that passed normalization and filtering
    """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        # Default to '' (not None): with None, a tweet missing both keys
        # would crash on None.strip() with AttributeError.
        text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, '')).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
예제 #4
0
def filter_twarr_text(twarr):
    """
    Pre-process the text of every tweet in *twarr*, discarding tweets whose
    processed text is invalid; each surviving tweet keeps its original text
    in the tk.key_orgntext field and the pre-processed result in tk.key_text.

    :param twarr: list, tweets to filter
    :return: list, tweets that passed text pre-processing and filtering
    """
    kept = []
    for tw in twarr:
        raw = tw.get(tk.key_text, '').strip()
        if not raw:
            continue
        norm = pu.text_normalization(raw).strip()
        # Drop tweets whose normalized text is empty or mostly non-alpha.
        if pu.is_empty_string(norm) or not pu.has_enough_alpha(norm, 0.65):
            continue
        tw[tk.key_orgntext] = raw
        tw[tk.key_text] = norm
        kept.append(tw)
    return kept
예제 #5
0

def merge_events_2016():
    """Load every file under the event-corpus directory and dump all of
    them, as a list of tweet arrays, into a single event2016.txt file."""
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    twarr_list = [fu.load_array(base + sub)
                  for sub in fi.listchildren(base, fi.TYPE_FILE)]
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt',
                  twarr_list)


if __name__ == '__main__':
    # merge_events_2016()
    import utils.pattern_utils as pu
    base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/"
    # Report (and remove in memory) tweets failing the 0.6 alpha-ratio
    # check in every corpus file; the dump back to disk stays disabled.
    for file in fi.listchildren(base, fi.TYPE_FILE, concat=True):
        twarr = fu.load_array(file)
        n_before = len(twarr)
        # Walk backwards so pop() never shifts an index we have yet to visit.
        for i in reversed(range(len(twarr))):
            text = twarr[i][tk.key_text]
            if not pu.has_enough_alpha(text, 0.6):
                print(text)
                twarr.pop(i)
        print(n_before, '->', len(twarr), '\n\n')
        # fu.dump_array(file, twarr)