def __init__(self):
    """Set up corpus directories, preprocessing helpers, and the shared
    prior-case contents cache used by the citation-similarity scorer."""
    self.prior_case_dir = 'training_data/Prior_Cases/'
    self.current_case_directory = "training_data/Current_Cases/"
    self.qg = QueryGenerator()
    self.preprocessor = Preprocessor()
    self.idf_score_evaluator = IDFScore(self.current_case_directory,
                                        self.prior_case_dir)
    # Populate the per-document cache first, then hand the same dict to the
    # scorer so both sides see identical contents.
    self.synset_contents = {}
    self.synset_generator(self.prior_case_dir)
    self.citation_similarity_scorer = CitationSimilarity()
    self.citation_similarity_scorer.synset_contents = self.synset_contents
    self.evaluator = IREvaluator('training_data/qrel.json')
    self.doc_wise_results = {}
    self.doc_wise_ranking = {}
    self.dir = ""
    self.results = {}
def __init__(self):
    """Set up corpus directories, the BM25 index over prior cases, and the
    query/IDF/evaluation helpers."""
    self.prior_case_dir = 'training_data/Prior_Cases/'
    self.current_case_directory = "training_data/Current_Cases/"
    self.bm25 = BM25(self.prior_case_dir, delimiter=' ')
    self.qg = QueryGenerator()
    self.idf_score_evaluator = IDFScore(self.current_case_directory,
                                        self.prior_case_dir)
    self.evaluator = IREvaluator('training_data/qrel.json')
    self.doc_wise_results = {}
    self.doc_wise_ranking = {}
    self.results = {}
def __init__(self):
    """Set up corpus directories, WordNet helpers, and the synset cache
    built from all prior cases."""
    self.prior_case_dir = 'training_data/Prior_Cases/'
    self.current_case_directory = "training_data/Current_Cases/"
    self.qg = QueryGenerator()
    self.preprocessor = Preprocessor()
    self.wordnet = WordnetSimilarityEvaluator()
    self.idf_score_evaluator = IDFScore(self.current_case_directory,
                                        self.prior_case_dir)
    # NOTE(review): "sysnet" looks like a typo for "synset", but the name is
    # shared with sysnet_generator() defined on the class, so it is kept.
    self.sysnet_contents = {}
    self.sysnet_generator(self.prior_case_dir)
    self.evaluator = IREvaluator('training_data/qrel.json')
    self.doc_wise_results = {}
    self.doc_wise_ranking = {}
    self.results = {}
class WordnetRunnerMultiprocessSerial:
    """Serial (single-process) runner that ranks prior cases for each
    current case using citation-context similarity.

    Preprocessed prior-case texts are cached in ``synset_contents`` and
    shared with a ``CitationSimilarity`` scorer.  The multiprocessing
    variants that previously lived here as commented-out code have been
    removed; this class is intentionally single-process.
    """

    def __init__(self):
        self.prior_case_dir = 'training_data/Prior_Cases/'
        self.current_case_directory = "training_data/Current_Cases/"
        self.qg = QueryGenerator()
        self.preprocessor = Preprocessor()
        self.idf_score_evaluator = IDFScore(self.current_case_directory,
                                            self.prior_case_dir)
        # Cache of preprocessed prior-case texts keyed by filename; the
        # citation scorer shares this exact dict.
        self.synset_contents = dict()
        self.synset_generator(self.prior_case_dir)
        self.citation_similarity_scorer = CitationSimilarity()
        self.citation_similarity_scorer.synset_contents = self.synset_contents
        self.evaluator = IREvaluator('training_data/qrel.json')
        self.doc_wise_results = dict()
        self.doc_wise_ranking = dict()
        self.dir = ""
        self.results = dict()

    def synset_generator(self, directory):
        """Preprocess every ``.txt`` file in *directory* into
        ``self.synset_contents`` (keyed by filename)."""
        self.dir = directory
        # Numeric sort when the name starts with a digit, else lexicographic.
        for file in sorted(os.listdir(directory),
                           key=lambda item: (int(item.partition('_')[2])
                                             if item[0].isdigit() else
                                             float('inf'), item)):
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(self.dir, str(filename)), 'r') as f:
                content = self.preprocessor.preprocess(f.read().lower())
                print("Adding to dict contents of file : " + str(filename) +
                      " ; pid : " + str(os.getpid()))
                self.synset_contents[filename] = content

    def iter_as_list(self, n, iterable):
        """Return the first *n* items of *iterable* as a list."""
        return list(islice(iterable, n))

    def convert_to_dict(self, scores):
        """Map positional *scores* onto 1-based, zero-padded prior-case
        filenames, e.g. index 0 -> ``prior_case_0001.txt``."""
        return {
            'prior_case_' + str(n).zfill(4) + '.txt': score
            for n, score in enumerate(scores, start=1)
        }

    def sort_by_document(self, results):
        """Collapse per-citation scores to one best score per prior case.

        For every current case, keeps the maximum score each prior-case
        document achieved over all of its citations.

        Returns:
            Tuple ``(doc_wise_results, doc_wise_ranking)``: dicts keyed by
            current case holding ranked ``(doc, score)`` pairs and the bare
            document ranking, respectively.
        """
        for prior_case in results:
            doc_score = dict()
            for citation in results[prior_case]:
                for doc, score in results[prior_case][citation].items():
                    # Keep the best score seen so far for this document.
                    if doc not in doc_score or score > doc_score[doc]:
                        doc_score[doc] = score
            # Sort once and reuse the ordering for both outputs.
            ordered = sorted(doc_score, key=doc_score.get, reverse=True)
            self.doc_wise_results[prior_case] = [(k, doc_score[k])
                                                 for k in ordered]
            self.doc_wise_ranking[prior_case] = ordered
        return self.doc_wise_results, self.doc_wise_ranking

    def query_wordnet_similarity_ranking(self, query, log_str):
        """Score *query* against every cached prior case via the shared
        citation-similarity scorer; *log_str* is passed through for logging."""
        return self.citation_similarity_scorer.citation_similarity_scores(
            query, log_str)

    def execute_baseline(self):
        """Run the pipeline over a fixed subset of current cases.

        For each citation context of each case: preprocess, keep the top-50%
        words by IDF as the query, and score it against all prior cases.
        The nested dict {case: {citation_index: {prior_case: score}}} is
        deep-copied into ``self.results``.
        """
        results = dict()
        # Fixed subset of current cases; replace with os.listdir(...) of
        # self.current_case_directory for a full run.
        for file in sorted([
                'current_case_0001.txt', 'current_case_0030.txt',
                'current_case_0034.txt', 'current_case_0068.txt',
                'current_case_0070.txt', 'current_case_0110.txt',
                'current_case_0133.txt', 'current_case_0163.txt'
        ],
                           key=lambda item: (int(item.partition('_')[2])
                                             if item[0].isdigit() else
                                             float('inf'), item)):
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue
            print("Current case : " + filename)
            with open(
                    os.path.join(self.current_case_directory, str(filename)),
                    'r') as f:
                content = f.read()
                surrounding_text_blocks = self.qg.generate_query(content)
                citation_counter = 0
                for surrounding_text in surrounding_text_blocks:
                    preprocessed_text = self.preprocessor.preprocess(
                        surrounding_text)
                    idf_score = self.idf_score_evaluator.get_idf_score(
                        preprocessed_text)
                    # Keep only the top 50% of words by IDF as the query.
                    selected_words_idf_scores = self.iter_as_list(
                        len(idf_score) // 2, idf_score.items())
                    selected_words = " ".join(
                        [i[0] for i in selected_words_idf_scores])
                    log_str = "Current Case : " + str(
                        filename) + " ; Citation counter : " + str(
                            citation_counter) + " ; "
                    print(log_str)
                    sys.stdout.flush()
                    wordnet_scores = self.query_wordnet_similarity_ranking(
                        selected_words, log_str)
                    wordnet_score_dict = self.convert_to_dict(wordnet_scores)
                    if filename not in results:
                        results[filename] = dict()
                    results[filename][citation_counter] = copy.deepcopy(
                        wordnet_score_dict)
                    citation_counter = citation_counter + 1
        self.results = copy.deepcopy(results)

    def get_eval_scores(self, results):
        """Evaluate *results* against the qrel judgements and return the
        aggregate scores from the evaluator."""
        doc_wise_results, doc_wise_ranking = self.sort_by_document(results)
        print(self.evaluator.get_doc_wise_results(doc_wise_ranking))
        print("___________________________________________")
        final_scores = self.evaluator.get_total_scores()
        return final_scores
class WordnetRunner:
    """Runner that ranks prior cases for each current case using WordNet
    synset path similarity between citation contexts and prior-case texts.

    NOTE(review): several names use "sysnet", apparently a typo for
    "synset"; they are kept because they form this class's existing
    attribute/method surface.
    """

    def __init__(self):
        self.prior_case_dir = 'training_data/Prior_Cases/'
        self.current_case_directory = "training_data/Current_Cases/"
        self.qg = QueryGenerator()
        self.preprocessor = Preprocessor()
        self.wordnet = WordnetSimilarityEvaluator()
        self.idf_score_evaluator = IDFScore(self.current_case_directory,
                                            self.prior_case_dir)
        # Cache of prior-case synsets keyed by filename.
        self.sysnet_contents = dict()
        self.sysnet_generator(self.prior_case_dir)
        self.evaluator = IREvaluator('training_data/qrel.json')
        self.doc_wise_results = dict()
        self.doc_wise_ranking = dict()
        self.results = dict()

    def sysnet_generator(self, directory):
        """Convert every ``.txt`` prior case in *directory* to synsets and
        cache them in ``self.sysnet_contents``."""
        file_counter = 1
        # Numeric sort when the name starts with a digit, else lexicographic.
        for file in sorted(os.listdir(directory),
                           key=lambda item: (int(item.partition('_')[2])
                                             if item[0].isdigit() else
                                             float('inf'), item)):
            print("Converting to sysnet of file no. : " + str(file_counter))
            file_counter = file_counter + 1
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(directory, str(filename)), 'r') as f:
                content = f.read().lower()
                self.sysnet_contents[filename] = self.wordnet.doc_to_synsets(
                    content)

    def iter_as_list(self, n, iterable):
        """Return the first *n* items of *iterable* as a list."""
        return list(islice(iterable, n))

    def convert_to_dict(self, scores):
        """Map positional *scores* onto 1-based, zero-padded prior-case
        filenames, e.g. index 0 -> ``prior_case_0001.txt``."""
        return {
            'prior_case_' + str(n).zfill(4) + '.txt': score
            for n, score in enumerate(scores, start=1)
        }

    def sort_by_document(self, results):
        """Collapse per-citation scores to one best score per prior case.

        Returns:
            Tuple ``(doc_wise_results, doc_wise_ranking)``: dicts keyed by
            current case holding ranked ``(doc, score)`` pairs and the bare
            document ranking, respectively.
        """
        for prior_case in results:
            doc_score = dict()
            for citation in results[prior_case]:
                for doc, score in results[prior_case][citation].items():
                    # Keep the best score seen so far for this document.
                    if doc not in doc_score or score > doc_score[doc]:
                        doc_score[doc] = score
            # Sort once and reuse the ordering for both outputs.
            ordered = sorted(doc_score, key=doc_score.get, reverse=True)
            self.doc_wise_results[prior_case] = [(k, doc_score[k])
                                                 for k in ordered]
            self.doc_wise_ranking[prior_case] = ordered
        return self.doc_wise_results, self.doc_wise_ranking

    def query_wordnet_similarity_ranking(self, query):
        """Return path-similarity scores of *query* against every cached
        prior case, in the cache's sorted filename order."""
        # Hoisted out of the loop: the query's synsets do not change per file.
        query_synsets = self.wordnet.doc_to_synsets(query)
        scores = []
        for counter, file in enumerate(
                sorted(self.sysnet_contents.keys(),
                       key=lambda item: (int(item.partition('_')[2])
                                         if item[0].isdigit() else
                                         float('inf'), item))):
            scores.append(
                self.wordnet.sysnset_path_similarity(
                    query_synsets, self.sysnet_contents[file]))
            print("Comparison no. : " + str(counter))
            sys.stdout.flush()
        return scores

    def execute_baseline(self):
        """Score citation queries for the first few current cases.

        For each citation context: preprocess, keep the top-50% words by
        IDF, and score against all prior cases.  Results accumulate in and
        are returned as ``self.results``.
        """
        no_of_cases = 0
        for file in sorted(os.listdir(self.current_case_directory),
                           key=lambda item: (int(item.partition('_')[2])
                                             if item[0].isdigit() else
                                             float('inf'), item)):
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue
            with open(
                    os.path.join(self.current_case_directory, str(filename)),
                    'r') as f:
                content = f.read()
                surrounding_text_blocks = self.qg.generate_query(content)
                citation_counter = 0
                for surrounding_text in surrounding_text_blocks:
                    preprocessed_text = self.preprocessor.preprocess(
                        surrounding_text)
                    idf_score = self.idf_score_evaluator.get_idf_score(
                        preprocessed_text)
                    # Keep only the top 50% of words by IDF as the query.
                    selected_words_idf_scores = self.iter_as_list(
                        len(idf_score) // 2, idf_score.items())
                    selected_words = " ".join(
                        [i[0] for i in selected_words_idf_scores])
                    wordnet_scores = self.query_wordnet_similarity_ranking(
                        selected_words)
                    wordnet_score_dict = self.convert_to_dict(wordnet_scores)
                    if filename not in self.results:
                        self.results[filename] = dict()
                    self.results[filename][citation_counter] = copy.deepcopy(
                        wordnet_score_dict)
                    citation_counter = citation_counter + 1
                    print("Citation counter : " + str(citation_counter))
                    sys.stdout.flush()
            # Run only a few cases; remove this guard for the full corpus.
            no_of_cases = no_of_cases + 1
            print(str(no_of_cases) + " : " + filename)
            sys.stdout.flush()
            if no_of_cases == 3:
                break
        return self.results

    def get_eval_scores(self, results):
        """Evaluate *results* against the qrel judgements and return the
        aggregate scores from the evaluator."""
        doc_wise_results, doc_wise_ranking = self.sort_by_document(results)
        print(self.evaluator.get_doc_wise_results(doc_wise_ranking))
        print("___________________________________________")
        final_scores = self.evaluator.get_total_scores()
        return final_scores