def _compute_updatable_papers(self, papers):
    """
    Determines which of the papers from the import will be created or
    re-created and builds model instances (without saving) where necessary.
    Returns a list of dicts of length len(papers) with the shape
    {"db_paper": ..., "will_update": ...}. db_paper=None indicates an error
    (possibly with the publication date), so that paper won't be
    created or updated.
    """
    paper_informations = []
    for paper in papers:
        if not paper["published_at"]:
            self.log(f"Not importing {paper['doi']} because the date is missing.")
            paper_informations.append({"db_paper": None, "will_update": False})
            continue
        try:
            db_paper = Paper.objects.get(doi=paper["doi"])
            if DataSource.compare(db_paper.data_source_value,
                                  paper["datasource_id"]) >= 0:
                paper_informations.append({"db_paper": db_paper, "will_update": False})
                continue
            # Delete db_paper and recreate -> easier to handle using bulk create.
            db_paper.delete()
            db_paper = Paper(doi=paper["doi"])
        except Paper.DoesNotExist:
            db_paper = Paper(doi=paper["doi"])
        paper_informations.append({"db_paper": db_paper, "will_update": True})
    return paper_informations
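# Hedged usage sketch (not part of the original code): how the result of
# _compute_updatable_papers might feed the bulk create that the comment in
# the recreate branch above alludes to. `_set_paper_fields` is an assumed
# helper name for copying imported attributes onto the unsaved model.
def _import_papers_sketch(self, papers):
    paper_informations = self._compute_updatable_papers(papers)
    to_create = []
    for paper, info in zip(papers, paper_informations):
        if info["db_paper"] is None or not info["will_update"]:
            continue  # erroneous or already up-to-date entries are skipped
        self._set_paper_fields(info["db_paper"], paper)  # assumed helper
        to_create.append(info["db_paper"])
    # One INSERT for all new/recreated rows instead of one query per paper.
    Paper.objects.bulk_create(to_create)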
def handle(self, *args, **options):
    literature_results = json.load(options['literature'])
    variants_found_in_papers = literature_results['variants']
    papers = literature_results['papers']
    crawl_date = literature_results['date']

    # Soft delete all existing records (they will be undeleted if they're
    # in the new data).
    Paper.objects.all().update(deleted=True)
    VariantPaper.objects.all().update(deleted=True)

    paper_objects = {}
    for pmid, paper in papers.items():
        query = Paper.objects.filter(pmid=pmid)
        if query.exists():
            # We already have this paper in the database.
            paper_objects[pmid] = query[0]
            query.update(deleted=False, crawl_date=crawl_date)
        else:
            if not paper['year']:
                paper['year'] = '0000'
            p = Paper(title=paper['title'], authors=paper['authors'],
                      journal=paper['journal'], keywords=paper['keywords'],
                      abstract=paper['abstract'], year=paper['year'],
                      deleted=False, pmid=paper['pmid'], crawl_date=crawl_date)
            p.save()
            paper_objects[pmid] = p

    for variant_genomic_coordinate, variant_instances in variants_found_in_papers.items():
        for variant in variant_instances:
            pmid = variant['pmid']
            points = variant['points']
            mentions = variant['mentions']
            if pmid in paper_objects:
                paper = paper_objects[pmid]
                if mentions is None:
                    mentions = []
                query = VariantPaper.objects.filter(
                    variant_hg38=variant_genomic_coordinate, paper=paper)
                if query.exists():
                    # We already have this VariantPaper.
                    query.update(mentions=mentions, points=points, deleted=False)
                else:
                    vp = VariantPaper(variant_hg38=variant_genomic_coordinate,
                                      paper=paper, points=points,
                                      mentions=mentions, deleted=False)
                    vp.save()
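# Shape of the literature JSON consumed by handle() above, inferred from
# the key accesses in the code; the concrete values are illustrative only:
#
# {
#   "date": "2024-01-31",
#   "papers": {
#     "12345678": {"title": "...", "authors": "...", "journal": "...",
#                  "keywords": "...", "abstract": "...", "year": "2020",
#                  "pmid": "12345678"}
#   },
#   "variants": {
#     "chr17:g.43045704T>C": [
#       {"pmid": "12345678", "points": 3, "mentions": ["..."]}
#     ]
#   }
# }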
def insert(self, datapoint: SerializableArticleRecord):
    self._validate_integrity_constraints(datapoint)

    if IgnoredPaper.objects.filter(doi=datapoint.doi).exists():
        raise DatabaseUpdate.SkipArticle("DOI is on ignore list")

    conflict = False
    try:
        with transaction.atomic():
            try:
                db_article = Paper.objects.get(doi=datapoint.doi)
                created = False
            except Paper.DoesNotExist:
                db_article = Paper(doi=datapoint.doi)
                created = True

            if not created:
                datasource_comparison = DataSource.compare(
                    db_article.data_source_value, datapoint.datasource)
                if datasource_comparison > 0:
                    datasource_name = DataSource(db_article.data_source_value).name
                    raise DatabaseUpdate.SkipArticle(
                        f"Article already tracked by {datasource_name}")
                elif (not self.force_update and not self.update_existing
                        and datasource_comparison == 0):
                    raise DatabaseUpdate.SkipArticle("Article already in database")

                changed_externally = db_article.scrape_hash != datapoint.md5
                changed_internally = db_article.manually_modified

                if not self.force_update and not changed_externally:
                    db_article.last_scrape = timezone.now()
                    db_article.save()
                    return db_article, False, False  # Article was neither created nor updated.

                if changed_internally:
                    conflict = True
                    raise DatabaseUpdate.Error(
                        "Conflict: Manual modification and external change")

            self._update(db_article, datapoint)
    except DatabaseUpdate.Error as ex:
        if conflict:
            self._handle_conflict(db_article, datapoint)
        raise ex

    return db_article, created, True  # Article was created or updated.
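# Hedged usage sketch (not from the original source): consuming insert()'s
# (article, created, updated) result for a batch of records. The `updater`
# object and the stats keys are illustrative assumptions; the exception
# types are the ones raised above.
def import_batch(updater, records):
    stats = {"created": 0, "updated": 0, "skipped": 0, "conflicts": 0}
    for record in records:
        try:
            _, created, updated = updater.insert(record)
        except DatabaseUpdate.SkipArticle:
            stats["skipped"] += 1
        except DatabaseUpdate.Error:
            # Covers the manual-modification conflict raised above.
            stats["conflicts"] += 1
        else:
            if created:
                stats["created"] += 1
            elif updated:
                stats["updated"] += 1
    return stats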
def index(request):
    global GDRIVE_DIR_ID
    bulk = BulkUploadForm()
    upl = UploadForm()
    if request.method == "POST":
        try:
            # A single-file POST means the UploadForm was submitted; the
            # AssertionError branch below handles the bulk upload instead.
            assert request.FILES.get('file', None) is not None
            upl = UploadForm(request.POST, request.FILES)
            if upl.is_valid():
                path = STATICFILES_DIRS[0]
                uid = uuid.uuid4()
                path = os.path.join(path, "files", "{}.pdf".format(uid))
                file = request.FILES.get('file')
                with open(path, 'wb+') as dest:
                    for chunk in file.chunks():
                        dest.write(chunk)
                if not GDRIVE_DIR_ID:
                    GDRIVE_DIR_ID = get_or_create_folder(GDRIVE_DIRNAME, public=True)
                paper = upl.save(commit=False)
                paper.link = upload_file(path, "{}.pdf".format(uid),
                                         folderId=GDRIVE_DIR_ID)
                keys_tmp = upl.cleaned_data.get("keywords")
                if upl.cleaned_data.get('custom_subject', '') != '':
                    paper.subject = upl.cleaned_data.get('custom_subject')
                paper.save()
                for key in keys_tmp:
                    paper.keywords.add(key)
                paper.save()
                LOG.info("New file uploaded: {}.pdf".format(uid))
                messages.success(request, "File Upload Successful")
                try:
                    del_key = request.POST.get('del_key', 0)
                    key = int(del_key)
                    if key > 0:
                        PaperRequest.objects.filter(pk=key).delete()
                        LOG.info("Request {} cleared".format(key))
                except Exception as e:
                    LOG.warning(e)
                os.remove(path)
        except AssertionError:
            if request.user.is_staff:
                # BulkUploadForm has been submitted.
                bulk = BulkUploadForm(request.POST, request.FILES)
                processed = 0
                saved = 0
                if bulk.is_valid():
                    raw_papers = json.load(request.FILES.get('bulk_file'))
                    for paper in raw_papers:
                        processed += 1
                        dep_code = str(paper.get("Department", "Other"))
                        if dep_code == "":
                            dep_code = "Other"
                        dep, _ = Department.objects.get_or_create(code=dep_code)
                        p = Paper(department=dep,
                                  year=paper.get("Year", None),
                                  subject=paper.get("Paper", None),
                                  link=paper.get("Link", None),
                                  paper_type=paper.get("Semester", None))
                        try:
                            p.save()
                            saved += 1
                        except Exception as e:
                            LOG.warning(e)
                    LOG.info("%d entries processed, %d entries saved"
                             % (processed, saved))
                    messages.success(
                        request,
                        "Bulk upload successful: {} entries saved".format(saved))
    return render(request, "upload.html", {"bulk_form": bulk, "crowd_form": upl})
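# Shape of the bulk-upload JSON consumed by the staff branch above,
# inferred from the paper.get(...) keys; the values are illustrative:
#
# [
#   {"Department": "CSE", "Year": 2019, "Paper": "Algorithms",
#    "Link": "https://drive.google.com/...", "Semester": "Autumn"}
# ]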