def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """Summarize each (text, filename) pair in *sub_stories* with the GA solver.

    For every story: split into sentences, clean and preprocess them, build
    TF-IDF vectors, compute the similarity features, run the Summerizer, and
    write the chosen summary to ``save_path/<filename>``.

    Args:
        processID: worker id (only used by the caller to partition work).
        POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE: GA hyper-parameters.
        sub_stories: iterable of ``(document_text, file_name)`` pairs.
        save_path: output directory for summaries.
        order_params: extra ordering parameters forwarded to Summerizer.
    """
    for example in sub_stories:
        file_name = os.path.join(save_path, example[1])
        start_time = time.time()
        # BUG FIX: the original pattern " . " let '.' act as a regex wildcard
        # (space, ANY char, space), splitting on things like " a ". Escape the
        # dot so we split on a literal " . ", matching main()'s behavior.
        raw_sents = re.split(r" \. ", example[0])
        # Remove sentences that clean_text rejects (it returns 'None' for them).
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) < 5:
            # Too few sentences to summarize meaningfully; skip this story.
            continue
        preprocessed_sentences = []
        for raw_sent in raw_sentences:
            preprocessed_sentences.append(preprocess_raw_sent(raw_sent))
        if len(preprocessed_sentences) < 10:
            # NOTE(review): fallback writer is called but processing still
            # continues below — presumably intentional; confirm.
            solution_for_exception(raw_sentences, file_name)
        title = preprocessed_sentences[0]
        # Per-sentence TF-IDF vectors (rows = sentences).
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        dense = vectors.todense()
        list_sentences_frequencies = dense.tolist()
        title_vector = list_sentences_frequencies[0]
        # TF-IDF of the whole document treated as one joined "sentence".
        document = [(" ").join(preprocessed_sentences)]
        vector_doc = vectorizer.fit_transform(document)
        dense_doc = vector_doc.todense()
        document_vector = dense_doc.tolist()[0]
        number_of_nouns = count_noun(raw_sentences, option=True)
        simWithTitle = sim_with_title(list_sentences_frequencies, title_vector)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        NUM_PICKED_SENTS = 4
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.solve()
        print(file_name)
        if best_individual is None:
            # No valid summary found: fall back to the exception writer.
            solution_for_exception(raw_sentences, file_name)
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
def sim_with_title_of_paragraph(document):
    """Compute per-sentence similarity to each paragraph's header sentence.

    The document is split into paragraphs on newlines; each paragraph is split
    into sentences on a literal ' . '. For every paragraph a TF-IDF model is
    fitted on its preprocessed sentences and each sentence's similarity to the
    paragraph's first sentence (its header) is recorded.

    Args:
        document: full document text with paragraphs separated by '\n'.

    Returns:
        (raw_sents, preprocessed_sents, sim_header): flat lists over all
        paragraphs — cleaned sentences, preprocessed sentences, and the
        similarity of each sentence to its paragraph header.
    """
    paragraphs = document.split('\n')
    sim_header = []
    raw_sents = []
    preprocessed_sents = []
    for para in paragraphs:
        raw = para.split(' . ')
        df = pd.DataFrame(raw, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        # .copy() so the column assignment below operates on an independent
        # frame instead of a .loc view (avoids SettingWithCopyWarning).
        newdf = df.loc[(df['preprocess_raw'] != 'None')].copy()
        # BUG FIX: the original applied preprocess_raw_sent to the UNFILTERED
        # df column and relied on index alignment, wasting work on rows that
        # were already filtered out. Apply it on the filtered frame instead.
        newdf['preprocessed_raw'] = newdf['preprocess_raw'].apply(
            lambda x: preprocess_raw_sent(x))
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        preprocessed_sentences = newdf['preprocessed_raw'].values.tolist()
        raw_sents.extend(raw_sentences)
        preprocessed_sents.extend(preprocessed_sentences)
        # Similarity of every sentence with the paragraph header
        # (row 0 of the per-paragraph TF-IDF matrix).
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        denselist = vectors.todense().tolist()
        simWithTitle = sim_with_title(denselist, denselist[0])
        sim_header.extend(simWithTitle)
        # Release the per-paragraph frames before the next iteration.
        del df
        del newdf
    return raw_sents, preprocessed_sents, sim_header
def main():
    """Summarize every story in the 'stories' directory into 'hyp'."""
    # Optimizer hyper-parameters.
    POPU_SIZE = 30
    MAX_GEN = 20
    CROSS_RATE = 0.8
    MUTATE_RATE = 0.4
    NUM_PICKED_SENTS = 4
    directory = 'stories'
    save_path = 'hyp'

    print("Setting: ")
    print("POPULATION SIZE: {}".format(POPU_SIZE))
    print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
    print("CROSSING RATE: {}".format(CROSS_RATE))
    print("MUTATION SIZE: {}".format(MUTATE_RATE))

    # Each entry is a (document_text, file_name) pair.
    stories = load_docs(directory)
    start_time = time.time()

    for example in stories:
        try:
            raw_sents = example[0].split(" . ")
            print("Preprocessing ", example[1])
            # Clean each sentence and drop the ones clean_text rejects
            # (it returns the string 'None' for those).
            frame = pd.DataFrame(raw_sents, columns=['raw'])
            frame['preprocess_raw'] = frame['raw'].apply(lambda x: clean_text(x))
            kept = frame.loc[(frame['preprocess_raw'] != 'None')]
            raw_sentences = kept['preprocess_raw'].values.tolist()
            # Two parallel preprocessed views: one for the solver, one
            # for proper-noun counting.
            sentences = [preprocess_raw_sent(s) for s in raw_sentences]
            sentences_for_NNP = [preprocess_numberOfNNP(s) for s in raw_sentences]

            title = preprocess_raw_sent(raw_sentences[0])
            number_of_nouns = count_noun(sentences_for_NNP)
            simWithTitle = sim_with_title(sentences, title)
            sim2sents = sim_2_sent(sentences)
            simWithDoc = [sim_with_doc(s, sentences) for s in sentences]
            print("Done preprocessing!")

            Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                                MAX_GEN, CROSS_RATE, MUTATE_RATE,
                                NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                                sim2sents, number_of_nouns)
            best_individual = Solver.PSO()
            file_name = os.path.join(save_path, example[1])

            if best_individual is None:
                print('No solution.')
            else:
                print(file_name)
                print(best_individual)
                Solver.show(best_individual, file_name)
        except Exception as e:
            # Best-effort batch run: log the failing story and keep going.
            print(example[1])
            print("type error: " + str(e))

    print("--- %s mins ---" % ((time.time() - start_time) / (60.0 * len(stories))))
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params, scheme):
    """Summarize each (text, filename) pair with the word-frequency features.

    Splits each story on newlines, drops empty/rejected and too-short
    sentences, builds frequency-based similarity features, scales MAX_GEN
    with the document length, runs the Summerizer, and writes the summary.

    Args:
        processID: worker id (used by the caller to partition work).
        POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE: solver hyper-parameters
            (MAX_GEN is overridden per-document below).
        sub_stories: iterable of ``(document_text, file_name)`` pairs.
        save_path: output directory for summaries.
        order_params: extra ordering parameters forwarded to Summerizer.
        scheme: solver scheme selector forwarded to Summerizer.
    """
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        # Title is taken before short-sentence filtering, as in the original.
        title_raw = raw_sentences[0]
        # BUG FIX: the original called raw_sentences.remove(raw_sent) while
        # iterating over raw_sentences, which skips the element right after
        # each removal — short sentences could survive and the raw/processed
        # lists could fall out of sync. Build filtered lists instead.
        kept_raw_sentences = []
        sentences = []
        sentences_for_NNP = []
        for raw_sent in raw_sentences:
            sent = preprocess_raw_sent(raw_sent)
            if len(sent.split(' ')) < 2:
                continue  # drop sentences with fewer than two tokens
            kept_raw_sentences.append(raw_sent)
            sentences.append(sent)
            sentences_for_NNP.append(preprocess_raw_sent(raw_sent, True))
        raw_sentences = kept_raw_sentences
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [
            sim_with_doc(list_sentences_frequencies, index_sentence=i)
            for i in range(len(sentences))
        ]
        # Scale the generation budget with document length.
        if len(sentences) < 20:
            MAX_GEN = 20
        elif len(sentences) < 50:
            MAX_GEN = 50
        else:
            MAX_GEN = 80
        print("POPULATION SIZE: {}".format(POPU_SIZE))
        print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        # Never pick more sentences than the document has.
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            NUM_PICKED_SENTS = 4
        MinLT = 1
        MaxLT = 7
        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params,
                            MinLT, MaxLT, scheme)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])
        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """Summarize each (text, filename) pair with the PSO solver.

    Splits each story on newlines, drops empty/rejected and too-short
    sentences, builds word-frequency similarity features, runs Summerizer.PSO,
    and writes the chosen summary to ``save_path/<filename>``.

    Args:
        processID: worker id (used by the caller to partition work).
        POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE: PSO hyper-parameters.
        sub_stories: iterable of ``(document_text, file_name)`` pairs.
        save_path: output directory for summaries.
        order_params: extra ordering parameters forwarded to Summerizer.
    """
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        # Title is taken before short-sentence filtering, as in the original.
        title_raw = raw_sentences[0]
        # BUG FIX: the original called raw_sentences.remove(raw_sent) while
        # iterating over raw_sentences, which skips the element right after
        # each removal — short sentences could survive and the raw/processed
        # lists could fall out of sync. Build filtered lists instead.
        kept_raw_sentences = []
        sentences = []
        sentences_for_NNP = []
        for raw_sent in raw_sentences:
            sent = preprocess_raw_sent(raw_sent)
            if len(sent.split(' ')) < 2:
                continue  # drop sentences with fewer than two tokens
            kept_raw_sentences.append(raw_sent)
            sentences.append(sent)
            sentences_for_NNP.append(preprocess_raw_sent(raw_sent, True))
        raw_sentences = kept_raw_sentences
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [
            sim_with_doc(list_sentences_frequencies, index_sentence=i)
            for i in range(len(sentences))
        ]
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        # Never pick more sentences than the document has.
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            NUM_PICKED_SENTS = 4
        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.PSO()
        file_name = os.path.join(save_path, example[1])
        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)