Пример #1
0
def update_score_inplace(abstr_list, relevant_docs,
      irrelevant_docs, mu_corpus=None):

   """Update in place the 'score' field of JSON-like documents.

   The relevance score of a given abstract a is (math notation):
   max {cosine(a,r) | r relevant} - max {cosine(a,i) | i irrelevant}
   The stored value is that difference rounded to 3 decimals and
   then multiplied by 10.

   Arguments:
     abstr_list: list of dicts with a 'text' field; the 'score'
        field of each dict is (over)written.
     relevant_docs: list of dicts with a 'tfidf' field.
     irrelevant_docs: list of dicts with a 'tfidf' field.
     mu_corpus: passed as is to 'tfidf.compute_from_texts'
        (an empty list when omitted or None).

   Returns nothing: documents of 'abstr_list' are modified in place.
   When there is no relevant (resp. irrelevant) document, the
   corresponding maximum is taken to be 0 instead of raising
   ValueError on the 'max' of an empty sequence."""

   # Do not use a mutable default argument: it would be shared
   # between all the calls of the function.
   if mu_corpus is None:
      mu_corpus = []

   new_texts = [abstr['text'] for abstr in abstr_list]
   new_tfidfs = tfidf.compute_from_texts(new_texts, mu_corpus)

   for (doc, new_tfidf) in zip(abstr_list, new_tfidfs):
      cosine_relevant = [
            tfidf.cosine(new_tfidf, relevant_doc['tfidf'])
            for relevant_doc in relevant_docs
      ]
      cosine_irrelevant = [
            tfidf.cosine(new_tfidf, irrelevant_doc['tfidf'])
            for irrelevant_doc in irrelevant_docs
      ]
      # 'max' fails on empty sequences: fall back to 0 (no similarity).
      best_relevant = max(cosine_relevant) if cosine_relevant else 0.0
      best_irrelevant = max(cosine_irrelevant) if cosine_irrelevant else 0.0
      doc['score'] = 10 * round(best_relevant - best_irrelevant, 3)
Пример #2
0
   def post(self):
      """Record the relevance feedback of a user sent by POST.

      POST parameters with value 'Yes' (resp. 'No') are taken as
      the PMIDs of newly relevant (resp. irrelevant) documents.
      Those documents are fetched, reduced to the fields 'pmid',
      'title' and 'tfidf', and appended to the relevant (resp.
      irrelevant) documents of the user, after removing earlier
      entries with the same PMIDs. The user data is then saved and
      the 'feedback.html' template is written to the response.

      The request is silently dropped (nothing is written) if the
      user is unknown, if there is no 'checksum' parameter or if
      the PMID validation fails."""
      # Who is it? Get it from the POST parameters.
      uid = self.request.get('uid')
      data = models.UserData.get_by_key_name(uid)
      if data is None:
         # Unknown user (presumably 'get_by_key_name' returns None
         # when the key does not exist): good-bye.
         return
      # Check that POST is issued from PubCron mail.
      # NOTE(review): the value returned by 'validate_request' is
      # never used; only the presence of a 'checksum' parameter is
      # tested below. Confirm that this is the intended check.
      checksum = self.validate_request(data)
      if not self.request.get('checksum'):
         # Could not check identity (hacked?!!): good-bye.
         return

      # Identity check successful. Do the update.
      new_relevant_pmids = []
      new_irrelevant_pmids = []

      # Process key/value pairs.
      for name in self.request.arguments():
         # NB: only PMID updates correspond to 'name' equal to
         # "Yes" or "No". The other cases are either no answer
         # or non PMID POST parameters (like uid or checksum).
         answer = self.request.get(name)
         if answer == 'Yes':
            new_relevant_pmids.append(name)
         elif answer == 'No':
            new_irrelevant_pmids.append(name)

      # It is unlikely that a malicious request went
      # until here, but because we are about to save user-
      # submitted data, we do a validity (security) check.
      pmids_to_update = new_relevant_pmids + new_irrelevant_pmids
      if not self.validate_pmid(pmids_to_update):
         # Validation failed: good-bye.
         return

      # From here, PMIDs have been parsed and checked.
      # Now recall and parse user JSON data.
      mu_corpus = utils.decrypt(data, 'mu_corpus')
      relevant_docs = utils.decrypt(data, 'relevant_docs')
      irrelevant_docs = utils.decrypt(data, 'irrelevant_docs')

      # Clear new docs from user data (in case users are notifying
      # that they change their mind on relevance). The lists are
      # rebuilt with a slice assignment: removing items from a list
      # while iterating over it skips the item following each
      # removal, which would leave stale documents behind.
      for doc_list in (relevant_docs, irrelevant_docs):
         doc_list[:] = [
             doc for doc in doc_list
             if doc.get('pmid') not in pmids_to_update
         ]

      # Now, get the PubMed data and compute tf-idf.
      for (new_ids, doc_list) in (
            (new_relevant_pmids, relevant_docs),
            (new_irrelevant_pmids, irrelevant_docs)):

         new_docs = eUtils.fetch_ids(new_ids)
         new_tfidf = tfidf.compute_from_texts(
             [abstr.get('text', '') for abstr in new_docs],
             mu_corpus.values()
         )
         for (doc, tfidf_dict) in zip(new_docs, new_tfidf):
            # Keep only fields 'pmid' and 'title'. Iterate over a
            # snapshot of the keys because 'doc' is modified in
            # the loop.
            for field_name in list(doc.keys()):
               if field_name not in ('pmid', 'title'):
                  doc.pop(field_name, None)
            # Add field 'tfidf'.
            doc['tfidf'] = tfidf_dict
         # Append to user data.
         doc_list.extend(new_docs)
         and_finally_remove_junk_from(doc_list)

      # Update the documents...
      data.relevant_docs = zlib.compress(json.dumps(relevant_docs))
      data.irrelevant_docs = zlib.compress(json.dumps(irrelevant_docs))
      # ... and put.
      data.put()

      # Now reassure the user.
      self.response.out.write(utils.render('feedback.html'))