def phrase_detection(bi_gram, file_name):
    """Apply a trained bi-gram phrase model to the tokens of one file.

    :param bi_gram: phrase model applied via indexing (``bi_gram[tokens]``) —
        presumably a gensim ``Phrases``/``Phraser``; confirm against caller.
    :param file_name: path of the text file, read through ``StoreHelper``.
    :return: the phrase-merged token stream produced by ``bi_gram[tokens]``.
    """
    # Flatten the whole file into one token list: segment + lemmatize each
    # line, keeping only non-empty tokens. A single nested comprehension
    # replaces the original identity list-copy and append loop.
    tokens = [
        token
        for line in StoreHelper.read_file(file_name).splitlines()
        for token in SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
        if token  # truthiness replaces len(token) > 0
    ]
    return bi_gram[tokens]
def generate_sentence_stream(post_count=8535,
                             path_pattern="../data/clean_post_lemmatize/%04d.dat",
                             output_file='sentence_stream.dat'):
    """Build a sentence stream (list of token lists) from the cleaned posts.

    Iterates over numbered post files, segments and lemmatizes every line,
    and persists the accumulated stream via ``StoreHelper.store_data``.

    :param post_count: number of post files to scan (previously the
        hard-coded constant 8535; kept as the default for compatibility).
    :param path_pattern: ``%``-style pattern producing each file path from
        its index.
    :param output_file: where the resulting stream is stored.
    :return: list of token lists, one per input line across all files.
    """
    sentence_stream = []
    for i in range(post_count):
        text_file = path_pattern % i
        # Indexes may be sparse on disk; skip missing files silently.
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            file_content = StoreHelper.read_file(text_file)
            for line in file_content.splitlines():
                sentence_stream.append(
                    SegmentHelper.lemmatization(SegmentHelper.segment_text(line)))
    StoreHelper.store_data(sentence_stream, output_file)
    return sentence_stream
def _remove_conjunction_segment(self, probability_dict):
    """Split ``self.raw_position`` at English stopwords and phrase-segment the chunks.

    Words are grouped into runs of consecutive non-stopwords; each run is
    joined into a candidate sentence and passed through
    ``SegmentHelper.phase_segment`` with a 0.05 threshold.

    :param probability_dict: probability table consumed by
        ``SegmentHelper.phase_segment``.
    :return: flat list of phrase segments from all stopword-free runs.
    """
    # Hoist the stopword list once and keep it as a set: the original called
    # stopwords.words('english') (and did a list membership test) for every
    # single word — O(words * stopwords) instead of O(words).
    stop_words = set(stopwords.words('english'))
    phase_list = []
    sentence_list = []
    word_group = []
    for word in SegmentHelper.segment_text(self.raw_position):
        if word in stop_words:
            # Stopword ends the current run; flush it if non-empty.
            if word_group:
                sentence_list.append(' '.join(word_group))
                word_group = []
        else:
            word_group.append(word)
    if word_group:  # flush the trailing run
        sentence_list.append(' '.join(word_group))
    for sentence in sentence_list:
        phase_list.extend(
            SegmentHelper.phase_segment(probability_dict, sentence, 0.05))
    return phase_list
def get_frequency_dict(content):
    """Tally token frequencies over every line of *content*.

    Each line is segmented and lemmatized; all resulting tokens are counted
    into a frequency dictionary via ``DictHelper.dict_from_count_list``.

    :param content: multi-line text to tokenize.
    :return: mapping of token -> occurrence count.
    """
    tokens = [
        token
        for raw_line in content.splitlines()
        for token in SegmentHelper.lemmatization(SegmentHelper.segment_text(raw_line))
    ]
    return DictHelper.dict_from_count_list(tokens)
def generate_word_list(self):
    """Return the lemmatized tokens of ``self.raw_position``, line by line.

    :return: flat list of tokens from every line of the raw position text.
    """
    tokens = []
    for raw_line in self.raw_position.splitlines():
        segmented = SegmentHelper.segment_text(raw_line)
        tokens += SegmentHelper.lemmatization(segmented)
    return tokens