def _compute_updatable_papers(self, papers):
    """
    Computes which of the papers from the import will be touched for (re-)creation
    and creates model instances (without saving), if necessary.
    Returns a list of dicts of size len(papers) of format {db_paper, will_update}.
    db_paper=None indicates an error (possibly with the publication date), so the
    paper won't be created/updated.
    """
    paper_informations = []
    for i, paper in enumerate(papers):
        if not paper["published_at"]:
            self.log(f"Not importing {paper['doi']} because the date is missing.")
            paper_informations.append({"db_paper": None, "will_update": False})
            continue
        try:
            db_paper = Paper.objects.get(doi=paper["doi"])
            if DataSource.compare(db_paper.data_source_value, paper["datasource_id"]) >= 0:
                paper_informations.append({"db_paper": db_paper, "will_update": False})
                continue
            else:
                # delete db_paper and recreate -> easier to handle using bulk create
                db_paper.delete()
                db_paper = Paper(doi=paper["doi"])
        except Paper.DoesNotExist:
            db_paper = Paper(doi=paper["doi"])
        paper_informations.append({"db_paper": db_paper, "will_update": True})
    return paper_informations
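
# Illustrative only: a minimal sketch of how the list returned by
# _compute_updatable_papers() might be consumed. The method name _bulk_recreate
# and its surrounding class are hypothetical; only the {db_paper, will_update}
# dict format comes from the code above.
def _bulk_recreate(self, papers):
    paper_informations = self._compute_updatable_papers(papers)
    to_create = [
        info["db_paper"]
        for info in paper_informations
        if info["db_paper"] is not None and info["will_update"]
    ]
    # Papers flagged for update were deleted above (if they existed),
    # so all of them can be inserted in a single bulk query.
    Paper.objects.bulk_create(to_create)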
def handle(self, *args, **options):
    literature_results = json.load(options['literature'])
    variants_found_in_papers = literature_results['variants']
    papers = literature_results['papers']
    crawl_date = literature_results['date']

    # Soft delete all existing records (they will be undeleted if they're
    # in the new data)
    existing_papers = Paper.objects.all()
    existing_papers.update(deleted=True)
    existing_variant_papers = VariantPaper.objects.all()
    existing_variant_papers.update(deleted=True)

    paper_objects = {}
    for pmid, paper in papers.items():
        query = Paper.objects.filter(pmid=pmid)
        if query.count() > 0:
            # we already have this paper in the database
            paper_objects[pmid] = query[0]
            query.update(deleted=False, crawl_date=crawl_date)
        else:
            if not paper['year']:
                paper['year'] = '0000'
            p = Paper(title=paper['title'], authors=paper['authors'],
                      journal=paper['journal'], keywords=paper['keywords'],
                      abstract=paper['abstract'], year=paper['year'],
                      deleted=False, pmid=paper['pmid'], crawl_date=crawl_date)
            p.save()
            paper_objects[pmid] = p

    for variant_genomic_coordinate, variant_instances in variants_found_in_papers.items():
        for variant in variant_instances:
            pmid = variant['pmid']
            points = variant['points']
            mentions = variant['mentions']
            if pmid in paper_objects:
                paper = paper_objects[pmid]
                if mentions is None:
                    mentions = []
                query = VariantPaper.objects.filter(
                    variant_hg38=variant_genomic_coordinate, paper=paper)
                if query.count() > 0:
                    # we already have this variantpaper
                    query.update(mentions=mentions, points=points, deleted=False)
                else:
                    vp = VariantPaper(variant_hg38=variant_genomic_coordinate,
                                      paper=paper, points=points,
                                      mentions=mentions, deleted=False)
                    vp.save()
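
# Illustrative only: handle() above calls json.load(options['literature']),
# which implies the management command registers 'literature' as an already
# opened file. A plausible add_arguments() sketch (not taken from the original
# command) could be:
import argparse

def add_arguments(self, parser):
    parser.add_argument('literature', type=argparse.FileType('r'),
                        help='JSON file produced by the literature crawler')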
def insert(self, datapoint: SerializableArticleRecord):
    self._validate_integrity_constraints(datapoint)
    if IgnoredPaper.objects.filter(doi=datapoint.doi).exists():
        raise DatabaseUpdate.SkipArticle("DOI is on ignore list")

    conflict = False
    try:
        with transaction.atomic():
            try:
                db_article = Paper.objects.get(doi=datapoint.doi)
                created = False
            except Paper.DoesNotExist:
                db_article = Paper(doi=datapoint.doi)
                created = True

            if not created:
                datasource_comparison = DataSource.compare(
                    db_article.data_source_value, datapoint.datasource)
                if datasource_comparison > 0:
                    datasource_name = DataSource(db_article.data_source_value).name
                    raise DatabaseUpdate.SkipArticle(
                        f"Article already tracked by {datasource_name}")
                elif (not self.force_update and not self.update_existing
                      and datasource_comparison == 0):
                    raise DatabaseUpdate.SkipArticle("Article already in database")

                changed_externally = db_article.scrape_hash != datapoint.md5
                changed_internally = db_article.manually_modified

                if not self.force_update and not changed_externally:
                    db_article.last_scrape = timezone.now()
                    db_article.save()
                    return db_article, False, False  # Article was neither created nor updated

                if changed_internally:
                    conflict = True
                    raise DatabaseUpdate.Error(
                        "Conflict: Manual modification and external change")

            self._update(db_article, datapoint)
    except DatabaseUpdate.Error as ex:
        if conflict:
            self._handle_conflict(db_article, datapoint)
        raise ex

    return db_article, created, True  # Article was created or updated
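
# Illustrative only: a rough sketch of how insert() might be driven by an
# import loop. The updater object and the records iterable are assumptions;
# only the (article, created, updated) return value and the
# DatabaseUpdate.SkipArticle / DatabaseUpdate.Error exceptions come from the
# code above.
def import_records(updater, records):
    created_count = updated_count = skipped_count = error_count = 0
    for record in records:
        try:
            _, created, updated = updater.insert(record)
            if created:
                created_count += 1
            elif updated:
                updated_count += 1
        except DatabaseUpdate.SkipArticle:
            skipped_count += 1
        except DatabaseUpdate.Error:
            error_count += 1
    return created_count, updated_count, skipped_count, error_count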
@staticmethod
def _validate_integrity_constraints(datapoint: SerializableArticleRecord):
    error = None
    error_msg = None
    if not datapoint.doi:
        error = "Missing DOI"
    elif not datapoint.title:
        error = "Missing title"
    elif len(datapoint.title) > Paper.max_length('title'):
        error = "Title too long"
        error_msg = error + f": {datapoint.title}"
    elif not datapoint.abstract:
        error = "Missing abstract"
    elif not datapoint.publication_date:
        error = "Missing publication date"

    if datapoint.doi and '\n' in datapoint.doi:
        error = "DOI has line breaks"

    author_count = 0
    for author in datapoint.authors:
        if ((author[1] and len(author[1]) > Author.max_length("first_name")) or
                (author[0] and len(author[0]) > Author.max_length("last_name"))):
            error = "Author name too long"
            error_msg = error + f": {author[0]}, {author[1]}"
        if not AuthorNameResolution.objects.filter(
                source_first_name=author[1],
                source_last_name=author[0],
                target_author=None).exists():
            # Count only authors that are not on the author ignore list
            author_count += 1

    if author_count == 0:
        error = "No authors"

    if error:
        if not error_msg:
            error_msg = error
        raise DatabaseUpdate.Error(error_msg)
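
# Illustrative only: Paper.max_length() and Author.max_length() are used above
# but not defined here. A plausible implementation, assuming the models share
# a small abstract base class (hypothetical), reads the declared max_length of
# a field via Django's _meta API:
from django.db import models

class MaxLengthMixin(models.Model):
    class Meta:
        abstract = True

    @classmethod
    def max_length(cls, field_name):
        # Length limit declared on the field, e.g. Paper.max_length("title")
        return cls._meta.get_field(field_name).max_length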
def index(request):
    global GDRIVE_DIR_ID
    bulk = BulkUploadForm()
    upl = UploadForm()
    if request.method == "POST":
        try:
            assert request.FILES.get('file', None) is not None
            # UploadForm is submitted
            upl = UploadForm(request.POST, request.FILES)
            if upl.is_valid():
                path = STATICFILES_DIRS[0]
                uid = uuid.uuid4()
                path = os.path.join(path, "files", "{}.pdf".format(uid))
                file = request.FILES.get('file')
                with open(path, 'wb+') as dest:
                    for chunk in file.chunks():
                        dest.write(chunk)
                if not GDRIVE_DIR_ID:
                    GDRIVE_DIR_ID = get_or_create_folder(GDRIVE_DIRNAME, public=True)
                paper = upl.save(commit=False)
                paper.link = upload_file(path, "{}.pdf".format(uid), folderId=GDRIVE_DIR_ID)
                keys_tmp = upl.cleaned_data.get("keywords")
                if upl.cleaned_data.get('custom_subject', '') != '':
                    paper.subject = upl.cleaned_data.get('custom_subject')
                paper.save()
                for key in keys_tmp:
                    paper.keywords.add(key)
                paper.save()
                LOG.info("New file uploaded: {}.pdf".format(uid))
                messages.success(request, "File Upload Successful")
                try:
                    del_key = request.POST.get('del_key', 0)
                    key = int(del_key)
                    if key > 0:
                        PaperRequest.objects.filter(pk=key).delete()
                        LOG.info("Request {} cleared".format(key))
                except Exception as e:
                    LOG.warning(e)
                os.remove(path)
        except AssertionError:
            if request.user.is_staff:
                # BulkUploadForm has been submitted
                bulk = BulkUploadForm(request.POST, request.FILES)
                processed = 0
                saved = 0
                if bulk.is_valid():
                    raw_papers = json.load(request.FILES.get('bulk_file'))
                    for paper in raw_papers:
                        processed += 1
                        dep_code = str(paper.get("Department", "Other"))
                        if dep_code == "":
                            dep_code = "Other"
                        dep, _ = Department.objects.get_or_create(code=dep_code)
                        p = Paper(department=dep,
                                  year=paper.get("Year", None),
                                  subject=paper.get("Paper", None),
                                  link=paper.get("Link", None),
                                  paper_type=paper.get("Semester", None))
                        try:
                            p.save()
                            saved += 1
                        except Exception as e:
                            LOG.warning(e)
                    LOG.info("%d entries processed, %d entries saved" % (processed, saved))
                    messages.success(request,
                                     "Bulk upload successful: {} entries saved".format(saved))
    return render(request, "upload.html", {"bulk_form": bulk, "crowd_form": upl})
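
# Illustrative only: the staff bulk-upload branch above reads 'bulk_file' as a
# JSON list and looks up the keys Department, Year, Paper, Link and Semester.
# The concrete values below are invented purely to show the expected shape.
EXAMPLE_BULK_FILE = [
    {
        "Department": "Other",
        "Year": 2020,
        "Paper": "Sample Subject",
        "Link": "https://example.com/sample.pdf",
        "Semester": "End-Semester",
    },
]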
def _update(self, db_article: Paper, datapoint: SerializableArticleRecord):
    db_article.title = datapoint.title
    db_article.abstract = datapoint.abstract
    db_article.published_at = datapoint.publication_date
    db_article.url = datapoint.url
    db_article.pdf_url = datapoint.pdf_url
    db_article.is_preprint = datapoint.is_preprint
    db_article.pubmed_id = datapoint.pubmed_id
    db_article.data_source_value = self.datasource

    db_article.covid_related = covid_related(db_article=db_article)
    if self.datasource.check_covid_related and not db_article.covid_related:
        raise DatabaseUpdate.Error("Article not covid related.")

    db_article.host, _ = PaperHost.objects.get_or_create(name=datapoint.paperhost)

    db_article.visualized = False
    db_article.vectorized = False
    db_article.save()

    AuthorPaperMembership.objects.filter(paper=db_article).delete()
    rank = 0
    for author in datapoint.authors:
        db_author, _ = Author.get_or_create_by_name(first_name=author[1],
                                                    last_name=author[0])
        if db_author is not None:
            AuthorPaperMembership.objects.create(paper=db_article,
                                                 author=db_author, rank=rank)
            rank += 1

    if datapoint.journal:
        db_article.journal, _ = Journal.objects.get_or_create(
            name=datapoint.journal[:Journal.max_length("name")])

    db_article.version = datapoint.version
    db_article.last_scrape = timezone.now()

    db_article.categories.clear()
    db_article.scrape_hash = datapoint.md5
    db_article.save()
def _import_papers(self, papers, paper_informations, authors, import_locations,
                   import_ml_categories, import_journals, tar):
    """
    Import papers and their associated authors, together with their relations to
    locations, categories and journals, depending on the bool parameters.
    The mappings of all related objects (except authors) must have been built
    before using this.
    """
    paper_title_max_len = Paper.max_length("title")
    author_firstname_max_len = Author.max_length("first_name")
    author_lastname_max_len = Author.max_length("last_name")

    papers_to_add = []
    category_memberships_to_create = []
    location_memberships_to_create = []

    for i, (paper, paper_info) in enumerate(zip(papers, paper_informations)):
        db_paper = paper_info["db_paper"]
        if not db_paper:
            continue

        if paper_info["will_update"]:
            db_paper.title = paper["title"][:paper_title_max_len]
            db_paper.abstract = paper["abstract"]
            db_paper.data_source_value = paper["datasource_id"]
            db_paper.version = paper["version"]
            db_paper.covid_related = paper["covid_related"]
            db_paper.url = paper["url"]
            db_paper.pdf_url = paper["pdf_url"]
            db_paper.is_preprint = paper["is_preprint"]
            db_paper.published_at = paper["published_at"]
            db_paper.last_scrape = make_aware(
                datetime.strptime(paper["last_scrape"], "%Y-%m-%d %H:%M:%S")
            ) if paper["last_scrape"] else None

            if self.export_version > 4:
                db_paper.scrape_hash = paper["scrape_hash"]
            if self.export_version > 5:
                db_paper.manually_modified = paper["manually_modified"]

            db_paper.host = (self._mappings.paperhost_mapping[paper["paperhost_id"]]
                             if paper["paperhost_id"] else None)
            db_paper.pubmed_id = paper["pubmed_id"] if "pubmed_id" in paper else None
            db_paper.journal = (
                self._mappings.journal_mapping[paper["journal_id"]]
                if import_journals and paper["journal_id"] else None
            )
            db_paper.data = (self._mappings.paperdata_mapping[db_paper.doi]
                             if db_paper.doi in self._mappings.paperdata_mapping else None)

            if self.export_version >= 4:
                db_paper.visualized = paper["visualized"]
                db_paper.vectorized = paper["vectorized"]

            img_path = paper["image"]
            if img_path:
                with tar.extractfile(img_path) as img_file:
                    image = Image.open(img_file)
                    buffer = BytesIO()
                    image.save(buffer, format="JPEG")
                    db_paper.add_preview_image(buffer, save=False)

            papers_to_add.append(db_paper)
            self.statistics.added_papers += 1

            # maps doi to a list of its db_authors
            self._mappings.doi_to_author_mapping[db_paper.doi] = []

            for author_id in paper["author_ids"]:
                author = authors[author_id]
                author_tuple = (author["firstname"][:author_firstname_max_len],
                                author["lastname"][:author_lastname_max_len])
                try:
                    db_author = Author.objects.get(
                        first_name=author["firstname"][:author_firstname_max_len],
                        last_name=author["lastname"][:author_lastname_max_len])
                    self._mappings.db_author_mapping[author_tuple] = {
                        "db_author": db_author, "created": False}
                except Author.DoesNotExist:
                    if author_tuple in self._mappings.db_author_mapping:
                        # author was already requested earlier
                        db_author = self._mappings.db_author_mapping[author_tuple]["db_author"]
                    else:
                        db_author = Author(
                            first_name=author["firstname"][:author_firstname_max_len],
                            last_name=author["lastname"][:author_lastname_max_len])
                        self._mappings.db_author_mapping[author_tuple] = {
                            "db_author": db_author, "created": True}
                        self.statistics.authors_created += 1
                self._mappings.doi_to_author_mapping[db_paper.doi].append(db_author)

        if import_ml_categories and not db_paper.categories.exists():
            # Set paper categories if they were not set (even on existing papers)
            if paper["category_memberships"]:
                self.statistics.papers_w_new_category += 1
            for category in paper["category_memberships"]:
                membership = CategoryMembership(
                    paper=db_paper,
                    category=self._mappings.category_mapping[category["identifier"]],
                    score=category["score"])
                category_memberships_to_create.append(membership)

        if import_locations and not db_paper.locations.exists():
            # Set paper locations if they were not set (even on existing papers)
            if paper["locations"]:
                self.statistics.papers_w_new_location += 1
            db_paper.location_modified = paper["location_modified"]
            for location in paper["locations"]:
                membership = GeoLocationMembership(
                    paper=db_paper,
                    location=self._mappings.location_mapping[location["id"]],
                    state=location["state"])
                location_memberships_to_create.append(membership)

    Paper.objects.bulk_create(papers_to_add)
    Author.objects.bulk_create([author["db_author"]
                                for author in self._mappings.db_author_mapping.values()
                                if author["created"]])
    CategoryMembership.objects.bulk_create(category_memberships_to_create)
    GeoLocationMembership.objects.bulk_create(location_memberships_to_create)

    author_paper_memberships = []
    for doi, authors in self._mappings.doi_to_author_mapping.items():
        author_paper_memberships += [
            AuthorPaperMembership(paper_id=doi, author_id=author.pk, rank=i)
            for i, author in enumerate(authors)
        ]
    AuthorPaperMembership.objects.bulk_create(author_paper_memberships)

    # recompute counts because post save signals are not triggered on bulk create
    GeoLocation.recompute_counts(GeoCity.objects.all(), GeoCountry.objects.all())