def create_dataset():
    """Print the predicted IMRaD labels for the chapter headings of each PDF.

    Every entry of ``os.listdir(UPLOAD_FOLDER)`` whose name ends in ``.pdf``
    is imported with ``ImporterTeambeam``, run through
    ``TextProcessor.proceed_paper``, and its non-blank section headings are
    handed to ``ClassifierSimple.predict_chapter``. For each heading that was
    assigned at least one of the reported types, one line of the form
    ``"<heading>: <TYPE> <TYPE> "`` is printed.

    Papers without any non-blank heading are skipped. The function returns
    nothing; its only visible output is what it prints.
    """
    # Types that are reported, in the order they appear in the printed line.
    # NOTE(review): any other IMRaDType member is never printed - confirm
    # that this is intentional.
    reported_types = (
        IMRaDType.ABSTRACT,
        IMRaDType.INTRODUCTION,
        IMRaDType.BACKGROUND,
        IMRaDType.RESULTS,
        IMRaDType.DISCUSSION,
        IMRaDType.ACKNOWLEDGE,
    )

    classifier = ClassifierSimple()
    for filename in os.listdir(UPLOAD_FOLDER):
        if not filename.endswith('.pdf'):
            continue

        importer = ImporterTeambeam()
        paper = importer.import_paper(filename)
        text_processing = TextProcessor()
        text_processing.proceed_paper(paper)

        # Filter by value, not identity: `heading is ''` only works when the
        # interpreter happens to intern the empty string.
        chapter_names = [
            section.heading
            for section in paper.sections
            if section.heading and not section.heading.isspace()
        ]
        if not chapter_names:
            continue

        # Assumes predict_chapter yields one row per heading, indexable by
        # IMRaDType value - TODO confirm against the classifier.
        prob = classifier.predict_chapter(chapter_names)
        for heading, row in zip(chapter_names, prob):
            # Each label keeps its trailing space so the printed text is
            # identical to what the previous implementation produced.
            labels = "".join(
                imrad_type.name + " "
                for imrad_type in reported_types
                if row[imrad_type.value] == 1
            )
            if labels:
                print("{0}: {1}".format(heading, labels))
class Preprocessor(object):
    """Bundle text processing, IMRaD detection and reference linking.

    The instance owns an ``IMRaDDetection``, a ``TextProcessor`` and a
    ``DBClient`` and delegates to them.
    """

    def __init__(self):
        self.imrad_detector = IMRaDDetection()
        self.text_processor = TextProcessor()
        self.client = DBClient()

    def __add_paper_to_reference(self, paper1, paper2):
        """Link references of ``paper1`` that look like ``paper2``'s title.

        Does nothing when ``paper2.title_proceed`` is falsy. Otherwise each
        reference's lower-cased raw text is compared with the lower-cased raw
        title of ``paper2``; on a ratio of at least
        ``REFERENCE_SIMULARITY_THRESHOLD`` the reference is pointed at
        ``paper2`` and ``paper1`` is written back through the client.
        """
        if not paper2.title_proceed:
            return

        for reference in paper1.references:
            matcher = SequenceMatcher(
                None,
                reference.complete_ref_raw.lower(),
                paper2.title_raw.lower(),
            )
            if matcher.ratio() >= REFERENCE_SIMULARITY_THRESHOLD:
                # "automated" is stored next to the id of the matched paper.
                reference.paper_id = [paper2.id, "automated"]
                self.client.update_paper(paper1)

    def proceed_paper(self, paper):
        """Run the text processor, then the IMRaD detector, on ``paper``."""
        self.text_processor.proceed_paper(paper)
        self.imrad_detector.proceed(paper)

    def proceed_queries(self, queries):
        """Return a dict with every query string passed through the text processor.

        The result always has a ``"whole-document"`` key: when ``queries``
        lacks it, it is added first with an empty string as its value.
        """
        processed = {}
        if "whole-document" not in queries:
            processed["whole-document"] = ""

        for section, text in queries.items():
            processed[section] = self.text_processor.proceed_string(text)
        return processed

    def link_references(self, new_paper):
        """Link ``new_paper`` with every paper returned by the client.

        For each stored paper the matching is attempted in both directions:
        first the stored paper's references against ``new_paper``, then
        ``new_paper``'s references against the stored paper.
        """
        for stored_paper in self.client.get_all_paper():
            self.__add_paper_to_reference(stored_paper, new_paper)
            self.__add_paper_to_reference(new_paper, stored_paper)