Example #1
0
 def _compute_updatable_papers(self, papers):
     """
     Determine which papers from the import will be touched for (re-)creation and
     build model instances (without saving), if necessary.

     Returns a list of dicts of size len(papers) of format {db_paper, will_update}.
     db_paper=None indicates an error (possibly with the publication date), so the
     paper won't be created/updated.
     """
     paper_informations = []
     # NOTE: the original iterated with enumerate() but never used the index.
     for paper in papers:
         if not paper["published_at"]:
             self.log(f"Not importing {paper['doi']} because the date is missing.")
             paper_informations.append({"db_paper": None, "will_update": False})
             continue
         try:
             db_paper = Paper.objects.get(doi=paper["doi"])
             # compare() >= 0 presumably means the existing record's source has
             # equal or higher priority — keep it untouched. TODO confirm semantics.
             if DataSource.compare(db_paper.data_source_value, paper["datasource_id"]) >= 0:
                 paper_informations.append({"db_paper": db_paper, "will_update": False})
                 continue
             else:
                 # delete db_paper and recreate -> easier to handle using bulk create
                 db_paper.delete()
                 db_paper = Paper(doi=paper["doi"])
         except Paper.DoesNotExist:
             db_paper = Paper(doi=paper["doi"])
         paper_informations.append({"db_paper": db_paper, "will_update": True})
     return paper_informations
Example #2
0
    def handle(self, *args, **options):
        """
        Import crawled literature results (papers and variant-paper links) from
        the JSON file passed via options['literature'].

        All existing Paper/VariantPaper rows are soft-deleted first and revived
        only if they appear in the new crawl data.
        """
        literature_results = json.load(options['literature'])
        variants_found_in_papers = literature_results['variants']
        papers = literature_results['papers']
        crawl_date = literature_results['date']

        # Soft delete all existing records (they will be undeleted if they're
        # in the new data)
        Paper.objects.all().update(deleted=True)
        VariantPaper.objects.all().update(deleted=True)

        paper_objects = {}
        # BUG FIX: dict.iteritems() is Python 2 only; items() works on both.
        for pmid, paper in papers.items():
            query = Paper.objects.filter(pmid=pmid)
            if query.exists():  # exists() avoids the COUNT(*) of count() > 0
                # we already have this paper in the database
                paper_objects[pmid] = query[0]
                query.update(deleted=False, crawl_date=crawl_date)
            else:
                if not paper['year']:
                    paper['year'] = '0000'
                p = Paper(title=paper['title'], authors=paper['authors'],
                          journal=paper['journal'], keywords=paper['keywords'],
                          abstract=paper['abstract'], year=paper['year'],
                          deleted=False, pmid=paper['pmid'], crawl_date=crawl_date)
                p.save()
                paper_objects[pmid] = p

        for variant_genomic_coordinate, variant_instances in variants_found_in_papers.items():
            for variant in variant_instances:
                pmid = variant['pmid']
                points = variant['points']
                mentions = variant['mentions']
                if pmid not in paper_objects:
                    # Variant references a paper we do not have; skip it.
                    continue
                paper = paper_objects[pmid]
                if mentions is None:  # identity check, not == (PEP 8)
                    mentions = []
                query = VariantPaper.objects.filter(
                    variant_hg38=variant_genomic_coordinate, paper=paper)
                if query.exists():
                    # we already have this variantpaper
                    query.update(mentions=mentions, points=points, deleted=False)
                else:
                    vp = VariantPaper(variant_hg38=variant_genomic_coordinate,
                                      paper=paper, points=points,
                                      mentions=mentions, deleted=False)
                    vp.save()
Example #3
0
    def insert(self, datapoint: SerializableArticleRecord):
        """
        Insert or update the Paper matching *datapoint*'s DOI inside a transaction.

        Returns a tuple (db_article, created, updated). Raises
        DatabaseUpdate.SkipArticle when the article should be left untouched and
        DatabaseUpdate.Error on validation failures or edit conflicts.
        """
        self._validate_integrity_constraints(datapoint)

        if IgnoredPaper.objects.filter(doi=datapoint.doi).exists():
            raise DatabaseUpdate.SkipArticle("DOI is on ignore list")

        conflict = False
        try:
            # Atomic block: any raised SkipArticle/Error rolls back partial writes.
            with transaction.atomic():
                try:
                    db_article = Paper.objects.get(doi=datapoint.doi)
                    created = False
                except Paper.DoesNotExist:
                    db_article = Paper(doi=datapoint.doi)
                    created = True

                if not created:
                    # compare() > 0 presumably means the existing record's source
                    # outranks the incoming one — TODO confirm semantics.
                    datasource_comparison = DataSource.compare(
                        db_article.data_source_value, datapoint.datasource)
                    if datasource_comparison > 0:
                        datasource_name = DataSource(
                            db_article.data_source_value).name
                        raise DatabaseUpdate.SkipArticle(
                            f"Article already tracked by {datasource_name}")
                    elif not self.force_update and not self.update_existing and datasource_comparison == 0:
                        raise DatabaseUpdate.SkipArticle(
                            "Article already in database")

                    # Did the scraped content change (hash mismatch) and/or did a
                    # human edit the local record since the last scrape?
                    changed_externally = db_article.scrape_hash != datapoint.md5
                    changed_internally = db_article.manually_modified

                    if not self.force_update and not changed_externally:
                        # Nothing new upstream: just refresh the scrape timestamp.
                        db_article.last_scrape = timezone.now()
                        db_article.save()
                        return db_article, False, False  # Article was neither created, nor updated

                    if changed_internally:
                        # Both sides changed: record the conflict flag so the
                        # except-handler below can persist the conflicting data.
                        conflict = True
                        raise DatabaseUpdate.Error(
                            "Conflict: Manual modification and external change"
                        )

                self._update(db_article, datapoint)
        except DatabaseUpdate.Error as ex:
            if conflict:
                self._handle_conflict(db_article, datapoint)
            raise ex

        return db_article, created, True  # Article was updated
Example #4
0
    def handle(self, *args, **options):
        """
        Import crawled literature results (papers and variant-paper links) from
        the JSON file passed via options['literature'].

        All existing Paper/VariantPaper rows are soft-deleted first and revived
        only if they appear in the new crawl data.
        """
        literature_results = json.load(options['literature'])
        variants_found_in_papers = literature_results['variants']
        papers = literature_results['papers']
        crawl_date = literature_results['date']

        # Soft delete all existing records (they will be undeleted if they're
        # in the new data)
        Paper.objects.all().update(deleted=True)
        VariantPaper.objects.all().update(deleted=True)

        paper_objects = {}
        # BUG FIX: dict.iteritems() is Python 2 only; items() works on both.
        for pmid, paper in papers.items():
            query = Paper.objects.filter(pmid=pmid)
            if query.exists():  # exists() avoids the COUNT(*) of count() > 0
                # we already have this paper in the database
                paper_objects[pmid] = query[0]
                query.update(deleted=False, crawl_date=crawl_date)
            else:
                if not paper['year']:
                    paper['year'] = '0000'
                p = Paper(title=paper['title'], authors=paper['authors'],
                          journal=paper['journal'], keywords=paper['keywords'],
                          abstract=paper['abstract'], year=paper['year'],
                          deleted=False, pmid=paper['pmid'], crawl_date=crawl_date)
                p.save()
                paper_objects[pmid] = p

        for variant_genomic_coordinate, variant_instances in variants_found_in_papers.items():
            for variant in variant_instances:
                pmid = variant['pmid']
                points = variant['points']
                mentions = variant['mentions']
                if pmid not in paper_objects:
                    # Variant references a paper we do not have; skip it.
                    continue
                paper = paper_objects[pmid]
                if mentions is None:  # identity check, not == (PEP 8)
                    mentions = []
                query = VariantPaper.objects.filter(
                    variant_hg38=variant_genomic_coordinate, paper=paper)
                if query.exists():
                    # we already have this variantpaper
                    query.update(mentions=mentions, points=points, deleted=False)
                else:
                    vp = VariantPaper(variant_hg38=variant_genomic_coordinate,
                                      paper=paper, points=points,
                                      mentions=mentions, deleted=False)
                    vp.save()
Example #5
0
    def _validate_integrity_constraints(datapoint: SerializableArticleRecord):
        """
        Validate required fields and length constraints of *datapoint*.

        Checks presence of DOI, title, abstract and publication date, title and
        author-name lengths, DOI sanity (no line breaks) and that at least one
        non-ignored author exists. Raises DatabaseUpdate.Error with a message
        describing the last failed check (later checks take precedence).
        """
        error = None
        # Optional detail message accompanying `error`. It must always describe
        # the *current* error, so it is reset whenever `error` is overwritten
        # by a check that carries no detail of its own.
        error_msg = None
        if not datapoint.doi:
            error = "Missing DOI"
        elif not datapoint.title:
            error = "Missing title"
        elif len(datapoint.title) > Paper.max_length('title'):
            error = "Title too long"
            error_msg = error + f": {datapoint.title}"
        elif not datapoint.abstract:
            error = "Missing abstract"
        elif not datapoint.publication_date:
            error = "Missing publication date"

        if datapoint.doi and '\n' in datapoint.doi:
            error = "DOI has line breaks"
            error_msg = None  # BUG FIX: drop stale "Title too long: ..." detail

        author_count = 0
        # Authors are (last_name, first_name) tuples.
        for author in datapoint.authors:
            if ((author[1]
                 and len(author[1]) > Author.max_length("first_name"))
                    or (author[0]
                        and len(author[0]) > Author.max_length("last_name"))):
                error = "Author name too long"
                error_msg = error + f": {author[0]}, {author[1]}"
            if not AuthorNameResolution.objects.filter(
                    source_first_name=author[1],
                    source_last_name=author[0],
                    target_author=None).exists():
                # Count only authors that are not on the author ignore list
                author_count += 1

        if author_count == 0:
            error = "No authors"
            error_msg = None  # BUG FIX: drop stale author-name detail

        if error:
            if not error_msg:
                error_msg = error
            raise DatabaseUpdate.Error(error_msg)
Example #6
0
File: views.py  Project: skkarn21/iqps
def index(request):
    """
    Combined upload view handling two forms on one endpoint.

    A POST carrying a 'file' upload is treated as a single-paper UploadForm
    submission: the PDF is written to a unique local path, uploaded to Google
    Drive, linked on the saved Paper, and the local copy removed. A POST
    without a 'file' falls through (via the failing assert) to the staff-only
    BulkUploadForm branch, which ingests a JSON list of paper records.
    """
    global GDRIVE_DIR_ID

    bulk = BulkUploadForm()
    upl = UploadForm()
    if request.method == "POST":
        try:
            # Control flow via AssertionError: a missing 'file' jumps to the
            # bulk-upload branch below.
            assert request.FILES.get('file', None) is not None
            # UploadForm is submitted
            upl = UploadForm(request.POST, request.FILES)
            if upl.is_valid():
                # Write the uploaded PDF under STATICFILES_DIRS[0]/files/<uuid>.pdf
                path = STATICFILES_DIRS[0]
                uid = uuid.uuid4()
                path = os.path.join(path, "files", "{}.pdf".format(uid))
                file = request.FILES.get('file')
                with open(path, 'wb+') as dest:
                    for chunk in file.chunks():
                        dest.write(chunk)
                # Lazily resolve/create the shared Drive folder once per process.
                if not GDRIVE_DIR_ID:
                    GDRIVE_DIR_ID = get_or_create_folder(GDRIVE_DIRNAME,
                                                         public=True)
                paper = upl.save(commit=False)
                paper.link = upload_file(path,
                                         "{}.pdf".format(uid),
                                         folderId=GDRIVE_DIR_ID)
                keys_tmp = upl.cleaned_data.get("keywords")
                if upl.cleaned_data.get('custom_subject', '') != '':
                    paper.subject = upl.cleaned_data.get('custom_subject')
                paper.save()

                # M2M keywords can only be attached after the first save.
                for key in keys_tmp:
                    paper.keywords.add(key)

                paper.save()
                LOG.info("New file uploaded: {}.pdf".format(uid))
                messages.success(request, "File Upload Successful")
                try:
                    # Optionally clear the PaperRequest this upload fulfils.
                    del_key = request.POST.get('del_key', 0)
                    key = int(del_key)
                    if key > 0:
                        PaperRequest.objects.filter(pk=key).delete()
                    LOG.info("Request {} cleared".format(key))
                except Exception as e:
                    LOG.warning(e)

                # Local copy is no longer needed once uploaded to Drive.
                os.remove(path)

        except AssertionError:
            if request.user.is_staff:
                # BulkUploadForm has been submitted
                bulk = BulkUploadForm(request.POST, request.FILES)
                processed = 0
                saved = 0
                if bulk.is_valid():
                    raw_papers = json.load(request.FILES.get('bulk_file'))
                    for paper in raw_papers:
                        processed += 1
                        dep_code = str(paper.get("Department", "Other"))
                        if dep_code == "":
                            dep_code = "Other"

                        dep, _ = Department.objects\
                            .get_or_create(code=dep_code)

                        p = Paper(department=dep,
                                  year=paper.get("Year", None),
                                  subject=paper.get("Paper", None),
                                  link=paper.get("Link", None),
                                  paper_type=paper.get("Semester", None))
                        try:
                            # Individual save failures are logged, not fatal.
                            p.save()
                            saved += 1
                        except Exception as e:
                            LOG.warning(e)

                        LOG.info("%d entries processed, %d entries saved" %
                                 (processed, saved))
                    messages.success(
                        request, "Bulk upload successful:\
                                     {} entries saved".format(saved))

    return render(request, "upload.html", {
        "bulk_form": bulk,
        "crowd_form": upl
    })
Example #7
0
    def _update(self, db_article: Paper, datapoint: SerializableArticleRecord):
        """
        Overwrite *db_article*'s fields with the scraped *datapoint* and save.

        Replaces all author memberships, re-links paperhost/journal, and resets
        the visualized/vectorized flags so downstream processing re-runs.
        Raises DatabaseUpdate.Error if this data source requires covid-related
        articles and the article is not covid related.
        """
        db_article.title = datapoint.title
        db_article.abstract = datapoint.abstract
        db_article.published_at = datapoint.publication_date

        db_article.url = datapoint.url
        db_article.pdf_url = datapoint.pdf_url
        db_article.is_preprint = datapoint.is_preprint
        db_article.pubmed_id = datapoint.pubmed_id
        db_article.data_source_value = self.datasource
        db_article.covid_related = covid_related(db_article=db_article)

        if self.datasource.check_covid_related and not db_article.covid_related:
            raise DatabaseUpdate.Error("Article not covid related.")

        db_article.host, _ = PaperHost.objects.get_or_create(
            name=datapoint.paperhost)

        # Reset pipeline flags; the article content changed, so downstream
        # steps must run again.
        db_article.visualized = False
        db_article.vectorized = False
        # First save: the article needs a PK before memberships can reference it.
        db_article.save()

        # Rebuild author memberships from scratch, preserving author order.
        AuthorPaperMembership.objects.filter(paper=db_article).delete()
        rank = 0
        for author in datapoint.authors:
            # Authors arrive as (last_name, first_name) tuples.
            db_author, _ = Author.get_or_create_by_name(first_name=author[1],
                                                        last_name=author[0])
            # db_author may be None — presumably an ignored name; skip it then.
            if db_author is not None:
                AuthorPaperMembership.objects.create(paper=db_article,
                                                     author=db_author,
                                                     rank=rank)
                rank += 1

        if datapoint.journal:
            db_article.journal, _ = Journal.objects.get_or_create(
                name=datapoint.journal[:Journal.max_length("name")])

        db_article.version = datapoint.version
        db_article.last_scrape = timezone.now()

        db_article.categories.clear()
        db_article.scrape_hash = datapoint.md5
        db_article.save()
Example #8
0
    def _import_papers(self, papers, paper_informations, authors,
                       import_locations, import_ml_categories, import_journals, tar):
        """
        Import papers and their associated authors. Also their relations with
        locations, categories and journals, depending on the bool parameters.

        The mapping of all things (except authors) must have been built before
        using this. New papers, authors and memberships are collected and
        inserted with bulk_create at the end.
        """
        paper_title_max_len = Paper.max_length("title")
        author_firstname_max_len = Author.max_length("first_name")
        author_lastname_max_len = Author.max_length("last_name")

        papers_to_add = []
        category_memberships_to_create = []
        location_memberships_to_create = []

        # NOTE: the original wrapped this in enumerate() but never used the index.
        for paper, paper_info in zip(papers, paper_informations):
            db_paper = paper_info["db_paper"]
            if not db_paper:
                # Marked as erroneous by the pre-computation step: skip entirely.
                continue

            if paper_info["will_update"]:
                db_paper.title = paper["title"][:paper_title_max_len]
                db_paper.abstract = paper["abstract"]
                db_paper.data_source_value = paper["datasource_id"]
                db_paper.version = paper["version"]
                db_paper.covid_related = paper["covid_related"]
                db_paper.url = paper["url"]
                db_paper.pdf_url = paper["pdf_url"]
                db_paper.is_preprint = paper["is_preprint"]
                db_paper.published_at = paper["published_at"]

                db_paper.last_scrape = make_aware(
                    datetime.strptime(paper["last_scrape"], "%Y-%m-%d %H:%M:%S")
                ) if paper["last_scrape"] else None

                # Newer export versions carry additional fields.
                if self.export_version > 4:
                    db_paper.scrape_hash = paper["scrape_hash"]
                if self.export_version > 5:
                    db_paper.manually_modified = paper["manually_modified"]
                db_paper.host = self._mappings.paperhost_mapping[paper["paperhost_id"]] if paper[
                    "paperhost_id"] else None
                db_paper.pubmed_id = paper.get("pubmed_id")  # key may be absent in old exports
                db_paper.journal = (
                    self._mappings.journal_mapping[paper["journal_id"]] if import_journals and paper[
                        "journal_id"] else None
                )
                db_paper.data = self._mappings.paperdata_mapping.get(db_paper.doi)

                if self.export_version >= 4:
                    db_paper.visualized = paper["visualized"]
                    db_paper.vectorized = paper["vectorized"]

                img_path = paper["image"]
                if img_path:
                    # Preview image lives inside the tar archive; re-encode as JPEG.
                    with tar.extractfile(img_path) as img_file:
                        image = Image.open(img_file)
                        buffer = BytesIO()
                        image.save(buffer, format="JPEG")
                        db_paper.add_preview_image(buffer, save=False)

                papers_to_add.append(db_paper)
                self.statistics.added_papers += 1

                self._mappings.doi_to_author_mapping[db_paper.doi] = []  # maps doi to a list of its db_authors

                for author_id in paper["author_ids"]:
                    author = authors[author_id]
                    author_tuple = (author["firstname"][:author_firstname_max_len],
                                    author["lastname"][:author_lastname_max_len])
                    try:
                        db_author = Author.objects.get(first_name=author["firstname"][:author_firstname_max_len],
                                                       last_name=author["lastname"][:author_lastname_max_len])
                        self._mappings.db_author_mapping[author_tuple] = {"db_author": db_author, "created": False}
                    except Author.DoesNotExist:
                        if author_tuple in self._mappings.db_author_mapping:
                            # author was already requested earlier
                            db_author = self._mappings.db_author_mapping[author_tuple]["db_author"]
                        else:
                            db_author = Author(first_name=author["firstname"][:author_firstname_max_len],
                                               last_name=author["lastname"][:author_lastname_max_len])
                            self._mappings.db_author_mapping[author_tuple] = {"db_author": db_author, "created": True}
                            self.statistics.authors_created += 1
                    self._mappings.doi_to_author_mapping[db_paper.doi].append(db_author)

            if import_ml_categories and not db_paper.categories.exists():
                # Set paper categories if they were not set (even on existing papers)
                if paper["category_memberships"]:
                    self.statistics.papers_w_new_category += 1
                for category in paper["category_memberships"]:
                    membership = CategoryMembership(paper=db_paper,
                                                    category=self._mappings.category_mapping[category["identifier"]],
                                                    score=category["score"])
                    category_memberships_to_create.append(membership)

            if import_locations and not db_paper.locations.exists():
                # Set paper locations if they were not set (even on existing papers)
                if paper["locations"]:
                    self.statistics.papers_w_new_location += 1
                    db_paper.location_modified = paper["location_modified"]
                for location in paper["locations"]:
                    membership = GeoLocationMembership(paper=db_paper,
                                                       location=self._mappings.location_mapping[location["id"]],
                                                       state=location["state"])
                    location_memberships_to_create.append(membership)

        Paper.objects.bulk_create(papers_to_add)
        # Only bulk-create the authors that did not exist in the database yet.
        Author.objects.bulk_create([entry["db_author"] for entry in self._mappings.db_author_mapping.values()
                                    if entry["created"]])
        CategoryMembership.objects.bulk_create(category_memberships_to_create)
        GeoLocationMembership.objects.bulk_create(location_memberships_to_create)

        author_paper_memberships = []
        # Loop variable renamed so it no longer shadows the `authors` parameter.
        for doi, paper_authors in self._mappings.doi_to_author_mapping.items():
            author_paper_memberships += [AuthorPaperMembership(paper_id=doi, author_id=author.pk, rank=rank)
                                         for rank, author in enumerate(paper_authors)]
        AuthorPaperMembership.objects.bulk_create(author_paper_memberships)
        # recompute counts because post save signals are not triggered on bulk create
        GeoLocation.recompute_counts(GeoCity.objects.all(), GeoCountry.objects.all())