from my_class.Tokenizer import Tokenizer


def split_zh_en(zh_en_str):
    """Split a mixed Chinese/English string into single-language runs.

    Returns (zh_en_group, en, zh): zh_en_group is a list of [mark, run]
    pairs, en and zh are the concatenated English and Chinese runs.
    """
    tokenizer = Tokenizer()
    mark = {"en": 1, "zh": 2}
    zh_en_group = []
    zh_set = []
    en_set = []
    status = ""
    en = ""
    zh = ""
    for c in zh_en_str:
        if tokenizer.is_zh(c):
            # Entering a Chinese run: flush any pending English characters.
            if status == 'en':
                zh_en_group.append([mark["en"], ''.join(en_set)])
                en += ''.join(en_set)
                en_set = []
            zh_set.append(c)
            status = 'zh'
        else:
            # Entering an English run: flush any pending Chinese characters.
            if status == 'zh':
                zh_en_group.append([mark["zh"], ''.join(zh_set)])
                zh += ''.join(zh_set)
                zh_set = []
            en_set.append(c)
            status = 'en'
    # Flush whichever run is still pending at the end of the string.
    if en_set:
        zh_en_group.append([mark["en"], ''.join(en_set)])
        en += ''.join(en_set)
    elif zh_set:
        zh_en_group.append([mark["zh"], ''.join(zh_set)])
        zh += ''.join(zh_set)
    if en == "":
        print 'error'
    return zh_en_group, en, zh
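# The Tokenizer class is not included in this section, so the sketch below
# shows one plausible is_zh() check (basic CJK Unified Ideographs block) and
# how split_zh_en() is then expected to behave; both the range check and the
# sample output are illustrative assumptions, not taken from the repository.
def is_zh_sketch(c):
    # True for characters in the basic CJK Unified Ideographs block.
    return u'\u4e00' <= c <= u'\u9fff'

# With such a check, split_zh_en(u'ab中文cd') would yield:
#   zh_en_group -> [[1, u'ab'], [2, u'中文'], [1, u'cd']]
#   en          -> u'abcd'
#   zh          -> u'中文'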
import sys
import os

from my_class.Document import Document
from my_class.Tokenizer import Tokenizer
from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io


if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'

    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # tokenize each line, drop stop words, and stem the remaining tokens
        normalize_tokens = []
        for line in doc_obj.get_lines():
            tokens = tokenizer.to_tokens(line.decode('utf-8'))
            for token in tokens:
                if tokenizer.is_stop_word(token):
                    token = ""
                elif token.isdigit():
                    normalize_tokens.append(token.encode('utf-8'))
                else:
                    token = tokenizer.stemming(token)
                    normalize_tokens.append(token.encode('utf-8'))
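# The stop-word and stemming behaviour lives inside Tokenizer, which is not
# shown here. The sketch below mirrors the normalization rule of the loop
# above using a hypothetical stop list and a trivial suffix-stripping
# stand-in for Tokenizer.stemming(); both are illustrative assumptions only.
STOP_WORDS_SKETCH = set(['the', 'a', 'an', 'of', 'and', 'are'])

def normalize_sketch(tokens):
    normalized = []
    for token in tokens:
        if token in STOP_WORDS_SKETCH:
            continue                              # drop stop words
        elif token.isdigit():
            normalized.append(token)              # keep numbers unchanged
        else:
            normalized.append(token.rstrip('s'))  # crude stand-in for stemming
    return normalized

# normalize_sketch([u'the', u'documents', u'are', u'2', u'cats'])
#   -> [u'document', u'2', u'cat']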