import string
from typing import Dict, List

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


class PlagiarismDetector(BaseService):
    plag_dao: PlagiarismDAO = inject(PlagiarismDAO)
    vectorizer = None

    @staticmethod
    def tokenize_and_stem(doc):
        """
        Splits the document into tokens and then performs stemming.

        :param doc:
        :return:
        """
        punctuation_remover = dict(
            (ord(char), None) for char in string.punctuation)
        tokens = nltk.word_tokenize(doc.lower().translate(punctuation_remover))
        return PlagiarismDetector.stem_tokens(tokens)

    @staticmethod
    def stem_tokens(tokens):
        """
        Stems the tokenized document.

        :param tokens:
        :return:
        """
        stemmer = nltk.stem.porter.PorterStemmer()
        stemmed_tokens = [stemmer.stem(item) for item in tokens]
        return stemmed_tokens

    def cosine_similarity(self, source_doc, input_doc):
        """
        Computes the similarity score for `input_doc` by matching it against
        `source_doc` using TF-IDF and cosine similarity.

        :param source_doc:
        :param input_doc:
        :return:
        """
        vectorizer = self.vectorizer or TfidfVectorizer(
            tokenizer=PlagiarismDetector.tokenize_and_stem,
            stop_words='english')
        tfidf = vectorizer.fit_transform([source_doc, input_doc])
        # Off-diagonal entry of the 2x2 pairwise similarity matrix.
        return (tfidf * tfidf.T).A[0, 1]

    def compute_similarity(self, input_doc) -> Dict:
        """
        Returns a dict containing the highest similarity score found and the
        most similar doc.

        :param input_doc:
        :return:
        """
        most_similar_so_far = dict(similarity_score=-1, doc=None)
        for doc_info in self.plag_dao.yield_docs():
            docs: List[Document] = doc_info['data']
            for doc in docs:
                similarity_score = self.cosine_similarity(doc.content, input_doc)
                if similarity_score > most_similar_so_far['similarity_score']:
                    most_similar_so_far['similarity_score'] = similarity_score
                    most_similar_so_far['doc'] = doc
        return most_similar_so_far
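A minimal standalone sketch of the TF-IDF/cosine-similarity step used above, runnable outside the service framework (the sample sentences are made up for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_cosine(source_doc, input_doc):
    # Fit TF-IDF on both documents, then read the off-diagonal entry
    # of the 2x2 pairwise similarity matrix, exactly as the service does.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform([source_doc, input_doc])
    return (tfidf * tfidf.T).A[0, 1]

# ~1.0: after stopword removal the two sentences share all tokens.
print(tfidf_cosine("the cat sat on the mat", "a cat sat on a mat"))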
class PlagiarismDetection(BaseController):
    plag_detector: PlagiarismDetector = inject(PlagiarismDetector)

    @intercept()
    def post(self, *args, **kwargs):
        """Detects plagiarism."""
        data = request.get_json(force=True)
        input_doc = data.get('text', None)
        if input_doc is None:
            ExceptionBuilder(BadRequest).error(
                HttpErrorCode.REQUIRED_FIELD, 'text').throw()

        most_similar_doc_info: Dict = self.plag_detector.compute_similarity(input_doc)
        most_similar_doc = most_similar_doc_info['doc']
        similarity_score = most_similar_doc_info['similarity_score']
        similarity_percentage = round(similarity_score * 100, 2)

        message = "Input text is {}% similar to the doc `{}` with a similarity score of {}".format(
            similarity_percentage, most_similar_doc.title, similarity_score)

        res_data = {
            'similarity_score': similarity_score,
            'similarity_percentage': similarity_percentage,
            'doc': most_similar_doc.to_dict()
        }
        return Response(status_code=200, message=message, data=res_data)
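A sketch of how a client might call this endpoint. The host comes from the test fixture further down; the /plagiarism path is a hypothetical route, since the URL mapping is not shown in this snippet:

import requests

# Hypothetical route; the actual path depends on how the API registers
# this controller.
resp = requests.post('http://0.0.0.0:5000/plagiarism',
                     json={'text': 'Some passage to check for plagiarism.'})
body = resp.json()
# Assumes the Response wrapper serializes its message/data arguments
# as top-level JSON keys.
print(body['message'])
print(body['data']['similarity_percentage'])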
class Document(BaseController):
    plag_dao: PlagiarismDAO = inject(PlagiarismDAO)
    elasticsearch_obj = elasticsearch.ElasticSearchFunction()

    @intercept()
    def post(self, *args, **kwargs):
        """Adds a new document to the repo."""
        data = request.get_json(force=True)
        content = data.get('content', '')
        title = data.get('title', '')
        description = data.get('description', '')
        author = data.get('author', '')
        if content and title:
            # Persist the document in the database.
            doc = self.plag_dao.create_doc(content, title,
                                           description=description,
                                           author=author)
            # Add the document to the Elasticsearch index.
            self.elasticsearch_obj.add(doc.to_dict_es())
        else:
            ExceptionBuilder(BadRequest).error(
                HttpErrorCode.REQUIRED_FIELD, 'content', 'title').throw()
        return Response(status_code=201, message='Document added successfully!')

    @intercept()
    def get(self):
        """
        Fetches all documents (paginated).

        :return:
        """
        res = self.plag_dao.get_docs(
            page=int(request.args.get("page", 1)),
            per_page=int(request.args.get("per_page", 10)),
            all='all' in request.args)
        docs_info = dict(data=[d.to_dict() for d in res['data']],
                         count=res['count'])
        return Response(data=docs_info)
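A client-side sketch for both actions, again assuming a hypothetical /documents route (only `content` and `title` are required by the controller above):

import requests

BASE = 'http://0.0.0.0:5000/documents'  # hypothetical route

# Add a document to the repo and the Elasticsearch index.
requests.post(BASE, json={
    'content': 'Full text of the source document...',
    'title': 'Sample Doc',
    'description': 'An example entry',
    'author': 'Jane Doe',
})

# Fetch the second page, 5 documents per page. The exact response
# envelope depends on the project's Response wrapper.
resp = requests.get(BASE, params={'page': 2, 'per_page': 5})
print(resp.json())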
class BaseController(Resource):
    """
    Every controller must extend this class.
    """
    # Child controllers must override this property with the default service
    # for a specific module.
    __service_class__ = inject('service.base.BaseService')

    @property
    def service(self):
        """
        Returns an instance of the service class (as defined under
        `__service_class__`, i.e. the default service) to be used inside a
        controller.

        Usage inside a controller's action methods::

            service_obj = self.service
        """
        if self.__service_class__ is None:
            raise NotImplementedError(
                "Controller {} must override '__service_class__' property".format(
                    self.__class__.__name__))
        return self.__service_class__.inject if isinstance(
            self.__service_class__, Injectable) else self.__service_class__
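A minimal sketch of the pattern a child controller is expected to follow. GreetingService and GreetingController are made-up names; the sketch just mirrors how inject() and Response are used elsewhere in this codebase:

class GreetingService(BaseService):
    def greet(self, name):
        return 'Hello, {}!'.format(name)


class GreetingController(BaseController):
    # Overriding __service_class__ lets `self.service` resolve to this
    # module's injected default service.
    __service_class__ = inject(GreetingService)

    def get(self):
        return Response(data={'message': self.service.greet('world')})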
def setUp(self):
    self.plag_dao: PlagiarismDAO = inject(PlagiarismDAO)
    self.plag_detector: PlagiarismDetector = inject(PlagiarismDetector)
    self.host = 'http://0.0.0.0:5000'
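One way such a fixture might be exercised, sketched under the assumption that the API is reachable at self.host while the tests run and that the detection endpoint lives at a hypothetical /plagiarism route (this method would sit on the same TestCase as setUp above):

import requests

def test_detect_plagiarism(self):
    # Hypothetical end-to-end check against the running API.
    res = requests.post(self.host + '/plagiarism',
                        json={'text': 'A passage already present in the repo.'})
    self.assertEqual(res.status_code, 200)
    self.assertIn('similarity_score', res.json()['data'])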
from nltk.tokenize import sent_tokenize


class PlagiarismDetection(BaseController):
    plag_detector: PlagiarismDetector = inject(PlagiarismDetector)
    elasticsearch_obj = elasticsearch.ElasticSearchFunction()
    functions_plag_obj = functions_plag.FunctionsPlagiarism()

    @intercept()
    def post(self, *args, **kwargs):
        """Detects plagiarism."""
        response_es = []

        data = request.get_json(force=True)
        input_doc = data.get('text', None)
        if input_doc is None:
            ExceptionBuilder(BadRequest).error(
                HttpErrorCode.REQUIRED_FIELD, 'text').throw()

        # Split the incoming text into sentences (each one is treated as a
        # paragraph below).
        token_text = sent_tokenize(input_doc)
        for paragraph_text in token_text:
            # Detect similarity using Elasticsearch.
            responseES = self.elasticsearch_obj.searchByContent(paragraph_text)
            # Reset per paragraph so highlights don't leak across iterations.
            highlight_response = []

            # Evaluate each highlighted fragment returned for the best hit.
            for highlight in responseES['hits']['hits'][0]['highlight']['content']:
                parag_text_clean = self.functions_plag_obj.getStringClean(paragraph_text)
                highlight_clean = self.functions_plag_obj.getStringClean(highlight)
                uncommon_words = list(self.functions_plag_obj.getUncommonWords(
                    parag_text_clean, highlight_clean))
                my_uncommon_words = self.functions_plag_obj.getHighlightPerformance(
                    uncommon_words, parag_text_clean)
                res_highlight_data = {
                    'content': highlight,
                    'levenshtein_distance': self.functions_plag_obj.getLevenshteinDistance(
                        parag_text_clean, highlight_clean),
                    'similarity_difflib': self.functions_plag_obj.getRatioSequenceMatcher(
                        parag_text_clean, highlight_clean),
                    'uncommon_words': uncommon_words,
                    'my_uncommon_words': my_uncommon_words
                }
                highlight_response.append(res_highlight_data)

            # Build the per-paragraph entry of the API response.
            res_es_data = {
                'paragraph_text': paragraph_text,
                'similarity_score': responseES['hits']['hits'][0]['_score'],
                'similarity_percentage': responseES['hits']['hits'][0]['_score'],
                'doc_': responseES['hits']['hits'][0]['_source'],
                'highlight': highlight_response
            }
            response_es.append(res_es_data)

        # Final payload returned by the POST.
        super_res_data = {'response_elastic': response_es}
        return Response(status_code=200, message='Return info match',
                        data=super_res_data)
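The FunctionsPlagiarism helpers are not shown in these snippets; below is a rough standalone sketch of the two comparison metrics they appear to wrap, assuming getRatioSequenceMatcher uses difflib's SequenceMatcher and getUncommonWords is a set-based word diff:

from difflib import SequenceMatcher

def ratio_sequence_matcher(a, b):
    # Similarity ratio in [0, 1] between the two cleaned strings.
    return SequenceMatcher(None, a, b).ratio()

def uncommon_words(a, b):
    # Words appearing in exactly one of the two strings
    # (symmetric difference of the word sets).
    return sorted(set(a.split()) ^ set(b.split()))

paragraph = 'the quick brown fox jumps over the lazy dog'
highlight = 'the quick brown cat jumps over the lazy dog'
print(ratio_sequence_matcher(paragraph, highlight))  # ~0.93
print(uncommon_words(paragraph, highlight))          # ['cat', 'fox']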