Example no. 1
0
 def generate_sentence_stream(file_count=8535, output_path='sentence_stream.dat'):
     """Build a stream of lemmatized, tokenized sentences from the cleaned posts.

     Reads every existing file ``../data/clean_post_lemmatize/%04d.dat`` for
     indices ``0 .. file_count-1``, splits each file into lines, segments and
     lemmatizes each line, and collects the resulting token lists.

     Args:
         file_count: Upper bound (exclusive) on the numbered input files to scan.
         output_path: Where the accumulated stream is persisted via StoreHelper.

     Returns:
         list: One entry per input line; each entry is the lemmatized token
         sequence produced by SegmentHelper.
     """
     sentence_stream = []
     for i in range(file_count):
         text_file = "../data/clean_post_lemmatize/%04d.dat" % i
         # Input file numbering may have gaps, so probe before reading.
         if StoreHelper.is_file_exist(text_file):
             print ("Working on %s" % text_file)
             file_content = StoreHelper.read_file(text_file)
             sentence_stream.extend(
                 SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
                 for line in file_content.splitlines())
     # Persist so later stages (e.g. phrase detection) can reload without re-parsing.
     StoreHelper.store_data(sentence_stream, output_path)
     return sentence_stream
Example no. 2
0
 def generate_phrase_dict(file_count=8535, min_count=2, threshold=2):
     """Detect bigram phrases over the saved sentence stream and re-split each post.

     Trains a gensim ``Phrases`` model on the previously stored
     ``sentence_stream.dat``, then runs phrase detection over every existing
     ``../data/clean_post_lemmatize/%04d.dat`` file and writes the detected
     phrases (with ``_`` joiners restored to spaces) to
     ``../data/gensim_split/%04d.dat``.

     Args:
         file_count: Upper bound (exclusive) on the numbered input files to scan.
         min_count: Passed to gensim ``Phrases`` — ignore rarer collocations.
         threshold: Passed to gensim ``Phrases`` — scoring cutoff for phrases.
     """
     sentence_stream = StoreHelper.load_data('sentence_stream.dat', [])
     phrases = Phrases(sentence_stream, min_count=min_count, threshold=threshold)
     # Phraser is the lightweight, read-only export of the trained Phrases model.
     bi_gram = Phraser(phrases)
     for i in range(file_count):
         text_file = "../data/clean_post_lemmatize/%04d.dat" % i
         output_file = "../data/gensim_split/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print ("Working on %s" % text_file)
             phrase_list = GensimHelper.phrase_detection(bi_gram, text_file)
             # gensim joins phrase tokens with '_'; restore human-readable spaces.
             phrase_list = [phrase.replace('_', ' ') for phrase in phrase_list]
             StoreHelper.store_data(phrase_list, output_file)
Example no. 3
0
 def run_lemmatize(src_folder, dst_folder, file_count=8535):
     """Normalize every numbered ``.dat`` file from src_folder into dst_folder.

     For each index ``0 .. file_count-1``, reads ``src_folder/%04d.dat`` if it
     exists, applies ``SegmentHelper.normalize`` to each line, and writes the
     normalized lines (joined with the platform line separator) to the
     same-numbered file under ``dst_folder``. Missing inputs are reported but
     do not stop processing.

     Args:
         src_folder: Directory containing the numbered input files.
         dst_folder: Directory to receive the normalized output files.
         file_count: Upper bound (exclusive) on the numbered files to scan.
     """
     for i in range(file_count):
         input_file = path.join(src_folder, "%04d.dat" % i)
         output_file = path.join(dst_folder, "%04d.dat" % i)
         if StoreHelper.is_file_exist(input_file):
             file_content = StoreHelper.read_file(input_file)
             new_content = [
                 SegmentHelper.normalize(line)
                 for line in file_content.splitlines()
             ]
             StoreHelper.save_file(os.linesep.join(new_content),
                                   output_file)
         else:
             # Gaps in the numbering are expected; log and continue.
             print("%s not exist!" % input_file)