def __init__(self, data):
    """Populate the instance from the mapping ``data``.

    Keys that are absent fall back to defaults: empty string for ``_id``
    and ``title_raw``, empty lists for ``authors``, ``sections``,
    ``references`` and ``cited_by``, and an argument-less ``WordHist()``
    for ``word_hist``. ``filename`` has no default and is ``None`` when
    missing.

    Args:
        data: Mapping supporting ``in`` and ``.get``; presumably a stored
            document or upload description -- confirm against callers.
    """
    self.id = data.get('_id', '')
    self.filename = data.get('filename')
    self.title_raw = data.get('title_raw', '')

    # Only run the text processor when no processed title was supplied.
    if 'title_proceed' in data:
        self.title_proceed = data.get('title_proceed')
    else:
        self.title_proceed = TextProcessor.proceed_string(self.title_raw)

    self.authors = [Authors(author) for author in data.get('authors', [])]
    self.sections = [Section(section) for section in data.get('sections', [])]
    self.references = [
        Reference(reference) for reference in data.get('references', [])
    ]
    self.cited_by = data.get('cited_by', [])

    if "word_hist" in data:
        self.word_hist = WordHist(data.get('word_hist'))
    else:
        self.word_hist = WordHist()

    if 'file' in data:
        self.file = data.get('file')
    else:
        try:
            # Context manager guarantees the handle is closed after reading;
            # the previous bare open(...).read() left that to the garbage
            # collector.
            with open(UPLOAD_FOLDER + self.filename, "rb") as upload:
                self.file = upload.read()
        except FileNotFoundError as e:
            print("Cant import file: {}. This should only happen in Tests".
                  format(e))
            self.file = bytearray()
def __init__(self, data):
    """Build the instance from the mapping ``data``.

    ``text_type`` is looked up by name in ``TextType``. When ``data`` has
    no ``text_proceed`` entry, the processed text is derived from the raw
    text via ``TextProcessor.proceed_string``.
    """
    raw_text = data.get('text_raw')

    self.text_type = TextType[data.get('text_type')]
    self.text_raw = raw_text
    if 'text_proceed' in data:
        self.text_proceed = data.get('text_proceed')
    else:
        self.text_proceed = TextProcessor.proceed_string(raw_text)
class Preprocessor(object):
    """Bundle an ``IMRaDDetection``, a ``TextProcessor`` and a ``DBClient``.

    Offers helpers to process a paper, process a dict of queries, and link
    references between a new paper and the papers returned by the client.
    """

    def __init__(self):
        """Create the detector, the text processor and the database client."""
        self.imrad_detector = IMRaDDetection()
        self.text_processor = TextProcessor()
        self.client = DBClient()

    def __add_paper_to_reference(self, paper1, paper2):
        """Point references of ``paper1`` at ``paper2`` when they match.

        Every reference whose lower-cased raw text is at least
        ``REFERENCE_SIMULARITY_THRESHOLD`` similar to the lower-cased raw
        title of ``paper2`` gets ``paper_id`` set to
        ``[paper2.id, "automated"]``; ``paper1`` is written through the
        client after each such match. Nothing happens when ``paper2`` has a
        falsy ``title_proceed``.
        """
        if paper2.title_proceed:
            for reference in paper1.references:
                matcher = SequenceMatcher(
                    None,
                    reference.complete_ref_raw.lower(),
                    paper2.title_raw.lower(),
                )
                if matcher.ratio() >= REFERENCE_SIMULARITY_THRESHOLD:
                    reference.paper_id = [paper2.id, "automated"]
                    self.client.update_paper(paper1)

    def proceed_paper(self, paper):
        """Run the text processor, then the IMRaD detector, on ``paper``."""
        self.text_processor.proceed_paper(paper)
        self.imrad_detector.proceed(paper)

    def proceed_queries(self, queries):
        """Return a dict with every query string run through the processor.

        A ``"whole-document"`` entry with an empty string is placed first
        when ``queries`` does not provide that key itself.
        """
        processed = {}
        if "whole-document" not in queries:
            processed["whole-document"] = ""
        for imrad_type in queries:
            processed[imrad_type] = self.text_processor.proceed_string(
                queries[imrad_type])
        return processed

    def link_references(self, new_paper):
        """Match ``new_paper`` against every paper from the client, both ways."""
        for known_paper in self.client.get_all_paper():
            # First direction: known paper citing the new one; then the reverse.
            self.__add_paper_to_reference(known_paper, new_paper)
            self.__add_paper_to_reference(new_paper, known_paper)
def __init__(self, data):
    """Build the instance from the mapping ``data``.

    ``complete_ref_proceed`` is taken from ``data`` when present, otherwise
    computed from the raw reference with ``TextProcessor.proceed_string``.
    ``title`` and ``paper_id`` default to an empty string; ``authors`` and
    ``reference_info`` default to empty lists and are otherwise turned into
    two-element ``[ReferenceType member, value]`` lists.
    """
    self.complete_ref_raw = data.get('complete_ref_raw')
    if 'complete_ref_proceed' in data:
        self.complete_ref_proceed = data.get('complete_ref_proceed')
    else:
        self.complete_ref_proceed = TextProcessor.proceed_string(
            data.get('complete_ref_raw'))

    self.title = data.get('title', '')
    self.paper_id = data.get('paper_id', '')

    # Each author entry pairs its ReferenceType with an Author object.
    self.authors = [
        [ReferenceType[entry.get('author_type')], Author(entry.get('author'))]
        for entry in data.get('authors', [])
    ]
    # Each info entry pairs its ReferenceType with the plain reference text.
    self.reference_info = [
        [ReferenceType[entry.get('reference_type')], entry.get('reference_text')]
        for entry in data.get('reference_info', [])
    ]
def __init__(self, data):
    """Build the instance from the mapping ``data``.

    ``heading_proceed`` is taken from ``data`` when present, otherwise
    computed from the raw heading with ``TextProcessor.proceed_string``.
    ``section_type`` is looked up by name in ``SectionType``. The list
    fields (``imrad_types``, ``text``, ``subsections``) default to empty
    lists, and ``word_hist`` defaults to an argument-less ``WordHist()``.
    """
    self.heading_raw = data.get('heading_raw')
    if 'heading_proceed' in data:
        self.heading_proceed = data.get('heading_proceed')
    else:
        self.heading_proceed = TextProcessor.proceed_string(
            data.get('heading_raw'))

    self.section_type = SectionType[data.get('section_type')]

    self.imrad_types = [IMRaDType[name] for name in data.get('imrad_types', [])]
    self.text = [Text(entry) for entry in data.get('text', [])]
    # Subsections are built from the nested entries with the same Section type.
    self.subsections = [Section(child) for child in data.get('subsections', [])]

    if "word_hist" in data:
        self.word_hist = WordHist(data.get('word_hist'))
    else:
        self.word_hist = WordHist()
def set_title(self, title_raw):
    """Store a new raw title together with its processed form.

    An empty string is ignored, leaving both title attributes untouched.
    """
    if title_raw == '':
        return
    self.title_raw = title_raw
    self.title_proceed = TextProcessor.proceed_string(title_raw)