import os
from os import path

# Phrases/Phraser come from gensim. The project-local helpers (StoreHelper,
# SegmentHelper, GensimHelper) are assumed to be importable from the
# surrounding package; their module paths are not shown in this file.
from gensim.models.phrases import Phrases, Phraser


def generate_sentence_stream():
    """Build a token stream (one lemmatized token list per line) from the cleaned postings."""
    sentence_stream = []
    for i in range(8535):
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            file_content = StoreHelper.read_file(text_file)
            for line in file_content.splitlines():
                sentence_stream.append(SegmentHelper.lemmatization(SegmentHelper.segment_text(line)))
    StoreHelper.store_data(sentence_stream, 'sentence_stream.dat')
    return sentence_stream

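# Hedged toy example of the gensim Phrases/Phraser pair that generate_phrase_dict()
# trains on the stream produced above. The job-ad wording here is made up; it only
# illustrates that token pairs which co-occur often enough (per min_count and
# threshold) come back joined with '_', which is why the code below later replaces
# '_' with a space.
def _bigram_demo():
    toy_stream = [
        ["machine", "learning", "engineer", "wanted"],
        ["machine", "learning", "with", "python"],
        ["machine", "learning", "and", "statistics"],
        ["senior", "data", "scientist"],
        ["junior", "data", "analyst"],
        ["cloud", "devops", "role"],
    ]
    bi_gram = Phraser(Phrases(toy_stream, min_count=2, threshold=2))
    # 'machine learning' repeats enough to pass the threshold, so the first
    # line should come back as ['machine_learning', 'engineer', 'wanted'].
    return [bi_gram[tokens] for tokens in toy_stream]
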
def generate_phrase_dict():
    """Train a bigram phrase model on the token stream and rewrite each posting with detected phrases."""
    sentence_stream = StoreHelper.load_data('sentence_stream.dat', [])
    phrases = Phrases(sentence_stream, min_count=2, threshold=2)
    bi_gram = Phraser(phrases)
    for i in range(8535):
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        output_file = "../data/gensim_split/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            phrase_list = GensimHelper.phrase_detection(bi_gram, text_file)
            # gensim joins detected bigrams with '_'; restore the space for readability
            phrase_list = [phrase.replace('_', ' ') for phrase in phrase_list]
            StoreHelper.store_data(phrase_list, output_file)

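# Hedged sketch: GensimHelper.phrase_detection is not defined in this file.
# Judging from how it is called above, it presumably reads one posting file,
# re-tokenizes each line, and pushes the tokens through the trained Phraser.
# The function below is an illustrative stand-in, not the project's real helper.
def _phrase_detection_sketch(bi_gram, text_file):
    phrase_list = []
    file_content = StoreHelper.read_file(text_file)
    for line in file_content.splitlines():
        tokens = SegmentHelper.segment_text(line)  # same tokenizer used for the stream
        phrase_list.extend(bi_gram[tokens])        # detected bigrams come back joined with '_'
    return phrase_list
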
def run_lemmatize(src_folder, dst_folder):
    """Normalize every line of each posting in src_folder and write the result to dst_folder."""
    for i in range(8535):
        input_file = path.join(src_folder, "%04d.dat" % i)
        output_file = path.join(dst_folder, "%04d.dat" % i)
        if StoreHelper.is_file_exist(input_file):
            file_content = StoreHelper.read_file(input_file)
            new_content = [SegmentHelper.normalize(line) for line in file_content.splitlines()]
            StoreHelper.save_file(os.linesep.join(new_content), output_file)
        else:
            print("%s does not exist!" % input_file)

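# Hedged driver sketch showing the order the three steps appear to run in:
# lemmatize the raw postings, build the token stream, then detect phrases.
# "../data/clean_posts" is a hypothetical source folder; the real input path
# for run_lemmatize is not shown in this file.
if __name__ == "__main__":
    run_lemmatize("../data/clean_posts", "../data/clean_post_lemmatize")
    generate_sentence_stream()
    generate_phrase_dict()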