def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """Run the ROUGE-feature GA summarizer over a batch of stories.

    Each element of ``sub_stories`` is ``(text, filename)`` where ``text``
    holds title, body and abstract separated by blank lines. Summaries are
    written into ``save_path/filename``.
    """
    for example in sub_stories:
        start_time = time.time()
        # Story layout: title \n\n body \n\n abstract.
        parts = re.split("\n\n", example[0])
        title = parts[0]
        raw_sents = parts[1].split(' . ')
        abstract = parts[2]
        # Drop sentences that clean_text rejects (it returns the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        preprocessed_sentences = [preprocess_raw_sent(s) for s in raw_sentences]
        raw_abs_sent_list = abstract.split(' . ')
        preprocessed_abs_sentences_list = [
            preprocess_raw_sent(s) for s in raw_abs_sent_list
        ]
        # Skip documents that are too short to summarize meaningfully.
        if (len(preprocessed_sentences) < 7
                or len(preprocessed_abs_sentences_list) < 3):
            continue
        rougeforsentences = evaluate_rouge(raw_sentences, abstract)
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        # BUG FIX: the original tested len(preprocessed_sent) — the length of
        # the *last sentence string* — instead of the sentence list, so the
        # short-document branch was effectively dead.
        if len(preprocessed_sentences) < 4:
            NUM_PICKED_SENTS = len(preprocessed_sentences)
        else:
            NUM_PICKED_SENTS = 4
        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, rougeforsentences, abstract,
                            order_params)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])
        print(file_name)
        if best_individual is None:
            # Fall back to a heuristic summary when the GA finds no solution.
            solution_for_exception(rougeforsentences, raw_sentences, file_name)
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """Write an extractive baseline summary per story: the top-ROUGE 20% of
    sentences, emitted in original document order.

    GA parameters are accepted for interface compatibility but unused here.
    """
    for example in sub_stories:
        start_time = time.time()
        # Story layout: title \n\n body \n\n abstract.
        parts = re.split("\n\n", example[0])
        raw_sents = parts[1].split(' . ')
        abstract = parts[2]
        # Drop sentences that clean_text rejects (it returns the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        preprocessed_sentences = [preprocess_raw_sent(s) for s in raw_sentences]
        raw_abs_sent_list = abstract.split(' . ')
        preprocessed_abs_sentences_list = [
            preprocess_raw_sent(s) for s in raw_abs_sent_list
        ]
        # Skip documents that are too short to summarize meaningfully.
        if (len(preprocessed_sentences) < 7
                or len(preprocessed_abs_sentences_list) < 3):
            continue
        rougeforsentences = evaluate_rouge(raw_sentences, abstract)
        # Rank by ROUGE score (x[1]) descending, keep the top 20%, then
        # restore original document order via the sentence index (x[2]).
        rank_rougeforsentences = sorted(rougeforsentences,
                                        key=lambda x: x[1], reverse=True)
        length_of_summary = int(0.2 * len(raw_sentences))
        rank_rouge = sorted(rank_rougeforsentences[:length_of_summary],
                            key=lambda x: x[2], reverse=False)
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        file_name = os.path.join(save_path, example[1])
        # BUG FIX (resource handling): use a context manager so the file is
        # closed even if a write raises.
        with open(file_name, 'w', encoding='utf-8') as f:
            for sent in rank_rouge:
                f.write(sent[0] + ' ')
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """Run the TF-IDF-feature GA summarizer over a batch of stories.

    The first kept sentence of each story doubles as its title.
    """
    for example in sub_stories:
        file_name = os.path.join(save_path, example[1])
        start_time = time.time()
        raw_sents = re.split(" . ", example[0])
        # Drop sentences that clean_text rejects (it returns the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) < 5:
            continue
        preprocessed_sentences = [preprocess_raw_sent(s) for s in raw_sentences]
        if len(preprocessed_sentences) < 10:
            # Write a heuristic fallback summary for short documents.
            # NOTE(review): there is no `continue` here, so the GA below still
            # runs and Solver.show() may overwrite this fallback — confirm
            # whether the early write is intended only as crash insurance.
            solution_for_exception(raw_sentences, file_name)
        # The first sentence doubles as the document title.
        title = preprocessed_sentences[0]
        # FIX: dropped the unused `vectorizer.get_feature_names()` call — the
        # value was never used and the API is removed in scikit-learn >= 1.2.
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        list_sentences_frequencies = vectors.todense().tolist()
        title_vector = list_sentences_frequencies[0]
        # TF-IDF for the whole document. NOTE(review): the vectorizer is
        # re-fit on the joined text (same term set, different idf weights) —
        # confirm this is intended rather than `transform`.
        document = [" ".join(preprocessed_sentences)]
        document_vector = vectorizer.fit_transform(document).todense().tolist()[0]
        number_of_nouns = count_noun(raw_sentences, option=True)
        simWithTitle = sim_with_title(list_sentences_frequencies, title_vector)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        NUM_PICKED_SENTS = 4
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.solve()
        print(file_name)
        if best_individual is None:
            # Fall back to a heuristic summary when the GA finds no solution.
            solution_for_exception(raw_sentences, file_name)
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
def sim_with_title_of_paragraph(document):
    """Score each sentence against its paragraph's header sentence.

    *document* is split into paragraphs on '\\n' and each paragraph into
    sentences on ' . '. The paragraph's first kept sentence is treated as
    its header. Returns ``(raw_sents, preprocessed_sents, sim_header)`` —
    flat, position-aligned lists over the whole document.
    """
    paragraphs = document.split('\n')
    sim_header = []
    raw_sents = []
    preprocessed_sents = []
    for para in paragraphs:
        raw = para.split(' . ')
        df = pd.DataFrame(raw, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        # FIX: take an explicit .copy() and compute the new column from
        # `newdf` itself. The original assigned into a .loc slice using a
        # column taken from `df`, which triggers SettingWithCopyWarning and
        # needlessly preprocessed rows that were already filtered out
        # (results were identical only thanks to index alignment).
        newdf = df.loc[(df['preprocess_raw'] != 'None')].copy()
        newdf['preprocessed_raw'] = newdf['preprocess_raw'].apply(
            lambda x: preprocess_raw_sent(x))
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        preprocessed_sentences = newdf['preprocessed_raw'].values.tolist()
        raw_sents.extend(raw_sentences)
        preprocessed_sents.extend(preprocessed_sentences)
        # Similarity of every sentence with the paragraph header (row 0).
        # NOTE(review): TfidfVectorizer raises on an empty vocabulary — this
        # assumes every paragraph keeps at least one non-trivial sentence;
        # confirm against the callers' inputs.
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        denselist = vectors.todense().tolist()
        sim_header.extend(sim_with_title(denselist, denselist[0]))
    return raw_sents, preprocessed_sents, sim_header
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """GA summarizer using paragraph-header similarity plus TF-IDF features.

    Sentence/title similarity comes from sim_with_title_of_paragraph();
    document- and abstract-similarity come from TF-IDF vectors.
    """
    for example in sub_stories:
        start_time = time.time()
        # Story layout: title \n\n body \n\n abstract.
        parts = re.split("\n\n", example[0])
        title = parts[0]
        raw_doc = parts[1]
        abstract = parts[2]
        raw_sentences, preprocessed_sentences, simWithTitle = \
            sim_with_title_of_paragraph(raw_doc)
        if len(raw_sentences) == 0:
            continue
        raw_abs_sent_list = abstract.split(' . ')
        preprocessed_abs_sentences_list = [
            preprocess_raw_sent(s) for s in raw_abs_sent_list
        ]
        # Skip documents that are too short to summarize meaningfully.
        if (len(preprocessed_abs_sentences_list) < 4
                or len(preprocessed_sentences) < 7):
            continue
        preprocessed_abs_sentences = " ".join(preprocessed_abs_sentences_list)
        # TF-IDF over body sentences + abstract + title (title is last row).
        # FIX: dropped the unused `get_feature_names()` call and `df_tfidf`
        # frame — the API is removed in scikit-learn >= 1.2 and neither
        # value was used.
        bodyandtitle = preprocessed_sentences.copy()
        bodyandtitle.append(preprocess_raw_sent(title.lower()))
        full_text = preprocessed_sentences.copy()
        full_text.append(preprocessed_abs_sentences)
        full_text.append(preprocess_raw_sent(title.lower()))
        vectorizer = TfidfVectorizer()
        denselist = vectorizer.fit_transform(full_text).todense().tolist()
        # TF-IDF for the whole document and the abstract.
        # NOTE(review): this re-fits the vectorizer, so idf weights differ
        # from the per-sentence fit above — confirm that is intended.
        document = [" ".join(bodyandtitle), preprocessed_abs_sentences]
        dense_doc = vectorizer.fit_transform(document).todense().tolist()
        document_vector = dense_doc[0]
        abstract_vector = dense_doc[1]
        # Keep only the body-sentence rows (drop abstract and title rows).
        list_sentences_frequencies = denselist[:-2]
        number_of_nouns = 0  # noun counting disabled in this variant
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        simWithAbs = sim_with_doc(list_sentences_frequencies, abstract_vector)
        rougeforsentences = evaluate_rouge(raw_sentences, abstract)
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        if len(preprocessed_sentences) < 4:
            NUM_PICKED_SENTS = len(preprocessed_sentences)
        else:
            NUM_PICKED_SENTS = 4
        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, simWithAbs,
                            rougeforsentences, order_params)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])
        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
def main():
    """Entry point: summarize every story in ./stories into ./hyp via PSO."""
    # GA/PSO hyper-parameters.
    POPU_SIZE = 30
    MAX_GEN = 20
    CROSS_RATE = 0.8
    MUTATE_RATE = 0.4
    NUM_PICKED_SENTS = 4
    directory = 'stories'
    save_path = 'hyp'
    print("Setting: ")
    print("POPULATION SIZE: {}".format(POPU_SIZE))
    print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
    print("CROSSING RATE: {}".format(CROSS_RATE))
    # BUG FIX: the label previously read "MUTATION SIZE" for the mutation rate.
    print("MUTATION RATE: {}".format(MUTATE_RATE))
    # List of (text, filename) documents.
    stories = load_docs(directory)
    start_time = time.time()
    for example in stories:
        try:
            raw_sents = example[0].split(" . ")
            print("Preprocessing ", example[1])
            # Drop sentences that clean_text rejects (it returns 'None').
            df = pd.DataFrame(raw_sents, columns=['raw'])
            df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
            newdf = df.loc[(df['preprocess_raw'] != 'None')]
            raw_sentences = newdf['preprocess_raw'].values.tolist()
            sentences = []
            sentences_for_NNP = []
            for raw_sent in raw_sentences:
                sentences.append(preprocess_raw_sent(raw_sent))
                sentences_for_NNP.append(preprocess_numberOfNNP(raw_sent))
            # The first kept sentence doubles as the title.
            title_raw = raw_sentences[0]
            title = preprocess_raw_sent(title_raw)
            number_of_nouns = count_noun(sentences_for_NNP)
            simWithTitle = sim_with_title(sentences, title)
            sim2sents = sim_2_sent(sentences)
            simWithDoc = [sim_with_doc(sent, sentences) for sent in sentences]
            print("Done preprocessing!")
            Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                                MAX_GEN, CROSS_RATE, MUTATE_RATE,
                                NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                                sim2sents, number_of_nouns)
            best_individual = Solver.PSO()
            file_name = os.path.join(save_path, example[1])
            if best_individual is None:
                print('No solution.')
            else:
                print(file_name)
                print(best_individual)
                Solver.show(best_individual, file_name)
        except Exception as e:
            # Per-document boundary: log and continue so one bad story does
            # not abort the whole batch.
            print(example[1])
            print("type error: " + str(e))
    # BUG FIX: guard the average-time report against an empty corpus
    # (previously a ZeroDivisionError when no stories were loaded).
    if stories:
        print("--- %s mins ---" %
              ((time.time() - start_time) / (60.0 * len(stories))))
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params, scheme):
    """Scheme-based summarizer with a generation count scaled to document size.

    Stories are newline-separated; the first kept sentence is the title.
    """
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])
        # Drop sentences that clean_text rejects (it returns the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        # The first kept sentence doubles as the title.
        title_raw = raw_sentences[0]
        # BUG FIX: the original called raw_sentences.remove() while iterating
        # raw_sentences, which silently skips the element after each removal.
        # Rebuild the kept list instead.
        sentences = []
        sentences_for_NNP = []
        kept_raw_sentences = []
        for raw_sent in raw_sentences:
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                continue  # drop near-empty sentences
            kept_raw_sentences.append(raw_sent)
            sentences.append(sent)
            sentences_for_NNP.append(sent_tmp)
        raw_sentences = kept_raw_sentences
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [
            sim_with_doc(list_sentences_frequencies, index_sentence=i)
            for i in range(len(sentences))
        ]
        # Scale the number of generations with the document size.
        if len(sentences) < 20:
            MAX_GEN = 20
        elif len(sentences) < 50:
            MAX_GEN = 50
        else:
            MAX_GEN = 80
        print("POPULATION SIZE: {}".format(POPU_SIZE))
        print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            NUM_PICKED_SENTS = 4
        # MinLT/MaxLT are forwarded to Summerizer — presumably agent lifetime
        # bounds for the scheme-based solver; confirm against Summerizer.
        MinLT = 1
        MaxLT = 7
        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params,
                            MinLT, MaxLT, scheme)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])
        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
def start_run(processID, sub_stories, save_path, word_embeddings):
    """TextRank-style baseline: PageRank over a cosine-similarity graph of
    50-d word-embedding sentence vectors; keep the top 20% of sentences in
    original document order.
    """
    for example in sub_stories:
        start_time = time.time()
        # Story layout: title \n\n body \n\n abstract.
        parts = re.split("\n\n", example[0])
        raw_sents = parts[1].split(' . ')
        abstract = parts[2]
        # Drop sentences that clean_text rejects (it returns the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        preprocessed_sentences = [preprocess_raw_sent(s) for s in raw_sentences]
        raw_abs_sent_list = abstract.split(' . ')
        preprocessed_abs_sentences_list = [
            preprocess_raw_sent(s) for s in raw_abs_sent_list
        ]
        # Skip documents that are too short to summarize meaningfully.
        if (len(preprocessed_sentences) < 7
                or len(preprocessed_abs_sentences_list) < 3):
            continue
        sentences = preprocessed_sentences.copy()
        # Average the word vectors of each sentence (zero vector when empty;
        # unknown words also fall back to the zero vector).
        sentence_vectors = []
        for sent in sentences:
            if len(sent) != 0:
                words = sent.split()
                v = sum(word_embeddings.get(w, np.zeros((50, )))
                        for w in words) / (len(words) + 0.001)
            else:
                v = np.zeros((50, ))
            sentence_vectors.append(v)
        # Cosine similarity matrix; it is symmetric, so compute each pair
        # once and mirror (the original computed both (i,j) and (j,i)).
        n_sents = len(sentences)
        sim_mat = np.zeros([n_sents, n_sents])
        for i in range(n_sents):
            for j in range(i + 1, n_sents):
                sim = cosine_similarity(
                    sentence_vectors[i].reshape(1, 50),
                    sentence_vectors[j].reshape(1, 50))[0, 0]
                sim_mat[i][j] = sim
                sim_mat[j][i] = sim
        nx_graph = nx.from_numpy_array(sim_mat)
        try:
            scores = nx.pagerank(nx_graph)  # score of every sentence
        except Exception:
            # pagerank can fail to converge; skip the document.
            continue
        scores_with_sentences = [(raw_sentences[i], scores[i], i)
                                 for i in range(len(raw_sentences))]
        # Top 20% by score, then restore original document order.
        rank_scores_with_sentences = sorted(scores_with_sentences,
                                            key=lambda x: x[1], reverse=True)
        length_of_summary = int(0.2 * len(raw_sentences))
        rank_text = sorted(rank_scores_with_sentences[:length_of_summary],
                           key=lambda x: x[2], reverse=False)
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        file_name = os.path.join(save_path, example[1])
        # BUG FIX (resource handling): use a context manager so the file is
        # closed even if a write raises.
        with open(file_name, 'w', encoding='utf-8') as f:
            for sent in rank_text:
                f.write(sent[0] + ' ')
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """PSO summarizer over newline-separated stories.

    The first kept sentence of each story doubles as its title.
    """
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])
        # Drop sentences that clean_text rejects (it returns the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        title_raw = raw_sentences[0]
        # BUG FIX: the original called raw_sentences.remove() while iterating
        # raw_sentences, which silently skips the element after each removal.
        # Rebuild the kept list instead.
        sentences = []
        sentences_for_NNP = []
        kept_raw_sentences = []
        for raw_sent in raw_sentences:
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                continue  # drop near-empty sentences
            kept_raw_sentences.append(raw_sent)
            sentences.append(sent)
            sentences_for_NNP.append(sent_tmp)
        raw_sentences = kept_raw_sentences
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [
            sim_with_doc(list_sentences_frequencies, index_sentence=i)
            for i in range(len(sentences))
        ]
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            NUM_PICKED_SENTS = 4
        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.PSO()
        file_name = os.path.join(save_path, example[1])
        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)