def main():
    """
    Executes the entire pipeline of the code
    :return: void
    """
    gt = getGroundTruth()
    model_sum, gt_sum = [], []

    print("Fetching encoder model...", end=" ")
    enc_model = SentenceTransformer('bert-base-nli-mean-tokens')
    print("Done")

    for full_text, catch_phrases in gt[:20]:
        # Embed each sentence
        sentence_embeddings = enc_model.encode(full_text)

        # Cluster the sentence embeddings
        cluster_n = 11
        clusters = cluster(sentence_embeddings, minimum_samples=cluster_n)

        # Mean sentence index of each cluster, used to order the summary sentences
        centroids = []
        for idx in range(cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))

        # Select the sentence closest to each cluster centre as its representative
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_,
                                                   sentence_embeddings)
        ordering = sorted(range(cluster_n), key=lambda k: centroids[k])
        summary = '.'.join([full_text[closest[idx]]
                            for idx in ordering]).replace('\n', ' ')

        model_sum.append(summary)
        gt_sum.append(".".join(catch_phrases))

    print("ROUGE score: {}".format(evaluate(model_sum, gt_sum)))
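# The cluster() helper called above is not shown in this section. A minimal
# sketch of what it is assumed to do: k-means over the sentence embeddings via
# scikit-learn, so that the clusters.labels_ and clusters.cluster_centers_
# attributes used above exist. The name and signature mirror the call site;
# the choice of KMeans and its parameters are assumptions, not the confirmed
# implementation.
def cluster(embeddings, minimum_samples=11):
    """Fit k-means with one cluster per summary sentence and return the fitted model."""
    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=minimum_samples, random_state=0)
    kmeans.fit(embeddings)
    return kmeans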
def main():
    """
    Executes the entire pipeline of the code
    :return: void
    """
    gt = getGroundTruth()

    # Build a Naive Bayes training set of (document text, last catch phrase as class label)
    nb_dataset = []
    for full_text, catch_phrases in gt[:500]:
        # Strip the leading sentence numbers from each sentence
        texts = [re.sub(r'^(\d+) (.*)', r'\2', text) for text in full_text]
        legal_text = " ".join(texts)
        legal_text = generateParagraphs(legal_text)
        nb_dataset.append((legal_text, catch_phrases[-1],))

    class_model, cv, legal_classes = nbTrain(nb_dataset)

    # Evaluate the classifier on the first 20 documents
    for full_text, catch_phrases in gt[:20]:
        texts = [re.sub(r'^(\d+) (.*)', r'\2', text) for text in full_text]
        legal_text = " ".join(texts)
        legal_text = generateParagraphs(legal_text)
        legal_text = cleanText(legal_text)

        text_cv = cv.transform([legal_text])
        legal_class = class_model.predict(text_cv)
        gt_legal_class = catch_phrases[-1]
        print(legal_class[0], gt_legal_class,
              legal_class[0] == legal_classes[gt_legal_class])
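# nbTrain() is referenced above but not shown in this section. A minimal sketch
# of the assumed implementation: a bag-of-words CountVectorizer feeding a
# multinomial Naive Bayes classifier, returning the trained model, the fitted
# vectorizer and a label-to-index map, matching how main() above unpacks and
# uses the result. The vectorizer settings are assumptions.
def nbTrain(nb_dataset):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    texts = [text for text, label in nb_dataset]
    labels = [label for text, label in nb_dataset]

    # Map each catch-phrase class to an integer id
    legal_classes = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    y = [legal_classes[label] for label in labels]

    cv = CountVectorizer(stop_words="english")
    X = cv.fit_transform(texts)

    class_model = MultinomialNB()
    class_model.fit(X, y)
    return class_model, cv, legal_classes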
def parseText():
    """
    Prints the first few headings of each document's full text
    :return: void
    """
    gt = getGroundTruth()
    for full_text, catch_phrases in gt[:100]:
        paragraphs, headings = generateParagraph(full_text)
        for heading in headings[:3]:
            print(heading)
        print("=" * 20)
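# getGroundTruth() is used by every snippet in this section but its body is not
# shown. A rough sketch, assuming the corpus is stored as one XML file per case
# with <sentence> and <catchphrase> elements (as in the UNSW legal case reports
# dataset); the directory name and tag names are assumptions. The return shape
# matches how it is consumed above: a list of (full_text, catch_phrases) pairs,
# each a list of strings.
def getGroundTruth(corpus_dir="corpus/fulltext"):
    import os
    import xml.etree.ElementTree as ET

    gt = []
    for fname in sorted(os.listdir(corpus_dir)):
        root = ET.parse(os.path.join(corpus_dir, fname)).getroot()
        full_text = [s.text or "" for s in root.iter("sentence")]
        catch_phrases = [c.text or "" for c in root.iter("catchphrase")]
        gt.append((full_text, catch_phrases))
    return gt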
def train(self):
    """
    Trains a classifier for the legal text
    :return: void
    """
    gt = getGroundTruth()

    # Training pairs of (preprocessed document text, last catch phrase as class label)
    nb_dataset = []
    for full_text, catch_phrases in gt[:500]:
        legal_text = self.preprocess(full_text)
        nb_dataset.append((legal_text, catch_phrases[-1],))

    self.nbTrain(nb_dataset)
def main(self):
    """
    Executes the entire pipeline of the code
    :return: void
    """
    gt = getGroundTruth()
    model_sum, gt_sum = [], []
    doc_n = len(gt)

    for doc_idx in range(20):
        print("{}/{}".format(doc_idx, doc_n))
        full_text, catch_phrases = gt[doc_idx]

        # Generate an extractive summary and pair it with the ground-truth catch phrases
        summary = self.getSentenceSummary(full_text)
        model_sum.append(summary)
        gt_sum.append(".".join(catch_phrases))

    print("ROUGE score: {}".format(self.evaluate(model_sum, gt_sum)))
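# The evaluate()/self.evaluate() step used above is not shown in this section.
# A sketch of the assumed ROUGE scoring, written as a module-level function with
# the same shape as the calls above; the `rouge` PyPI package and the averaging
# choice are assumptions, and any equivalent ROUGE implementation would fit.
def evaluate(model_sum, gt_sum):
    from rouge import Rouge

    rouge = Rouge()
    # Average ROUGE-1/2/L F-scores over all (summary, reference) pairs
    return rouge.get_scores(model_sum, gt_sum, avg=True)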
def getConclusion(self):
    """
    Prints the last catch phrase of every doc, ordered by frequency
    :return: void
    """
    gt = getGroundTruth()

    # Count how often each closing catch phrase occurs
    conclusion_freq = {}
    for full_text, catch_phrases in gt[:500]:
        conclusion = catch_phrases[-1]
        if conclusion not in conclusion_freq:
            conclusion_freq[conclusion] = 0
        conclusion_freq[conclusion] += 1

    conclusions = [(word, freq) for word, freq in conclusion_freq.items()]
    conclusions.sort(key=lambda x: x[1], reverse=True)
    for conclusion, _ in conclusions:
        print(conclusion)
def getIntroductions(self):
    """
    Prints the word frequencies of the first catch phrase of every doc
    :return: void
    """
    gt = getGroundTruth()

    # Count non-stop-word frequencies across the opening catch phrases
    intro_word_freq = {}
    for full_text, catch_phrases in gt[:500]:
        intro_words = catch_phrases[0].split(" ")
        for word in intro_words:
            if word not in self.stop_words:
                if word not in intro_word_freq:
                    intro_word_freq[word] = 0
                intro_word_freq[word] += 1

    intro_words = [(word, freq) for word, freq in intro_word_freq.items()]
    intro_words.sort(key=lambda x: x[1], reverse=True)
    print(intro_words)
def getHeadings(self):
    """
    Prints the headings detected in the full text of the first document
    :return: void
    """
    gt = getGroundTruth()

    # Headings are sentences followed by whitespace-only blank lines
    pattern = re.compile(r'.+(\n )+\n.+')
    for full_text, catch_phrases in gt[:1]:
        print("".join(full_text))
        headings = []
        for sent in full_text:
            if pattern.search(sent) is not None:
                # Collapse the run of blank lines into a single newline
                sent = re.sub(r'(\n( )*)+\n', r'\n', sent)
                headings.append(sent)

        print(len(headings))
        for heading in headings:
            print("============================")
            print(heading)
def parseText():
    """
    Segments each document's paragraphs into thematic groups
    :return: list of paragraph segments, one entry per document
    """
    gt = getGroundTruth()
    paraSegmentsFinal = []
    for full_text, catch_phrases in gt[:100]:
        paragraphs, headings, paragraphsUnderHeading = generateParagraph(full_text)
        paraSegments = thematicSegmentation(paragraphs, headings,
                                            paragraphsUnderHeading)
        paraSegmentsFinal.append(paraSegments)
    return paraSegmentsFinal
def main():
    """
    Executes the entire pipeline of the code
    :return: void
    """
    gt = getGroundTruth()
    model_sum, gt_sum = [], []

    # Use skip-thought vectors in place of the SentenceTransformer embeddings
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)

    for full_text, catch_phrases in gt:
        # Embed each sentence
        encoded = encoder.encode(full_text)

        # Cluster the sentence embeddings
        cluster_n = 11
        clusters = cluster(encoded, minimum_samples=cluster_n)

        # Mean sentence index of each cluster, used to order the summary sentences
        centroids = []
        for idx in range(cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))

        # Select the sentence closest to each cluster centre as its representative
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_, encoded)
        ordering = sorted(range(cluster_n), key=lambda k: centroids[k])
        print(ordering)
        summary = ' '.join([full_text[closest[idx]]
                            for idx in ordering]).replace('\n', ' ')
        model_sum.append(summary)

        # Debug output; stop after the first document
        print([(full_text[closest[idx]], closest[idx]) for idx in ordering])
        print(summary)
        print(len(catch_phrases))
        print(".".join(catch_phrases))
        gt_sum.append(".".join(catch_phrases))
        break