def assign_jxn_potential(article):
    """Attach 'JxnPotential' metadata mined from the article's methods text.

    The methods section of the article's first full text is split into
    sentences.  A sentence matching ``jxn_not_re`` contributes the label
    'Not corrected'; otherwise a sentence matching ``jxn_re`` contributes
    'Corrected'.  Every label collected is attached to the article through
    ``update_amd_obj`` (both labels may be attached).

    Nothing is done when the article has no full text.  When no methods
    tag can be located, the article's pmid, title and journal are printed
    and no metadata is written.
    """
    if article.articlefulltext_set.all().count() <= 0:
        return

    full_text_ob = article.articlefulltext_set.all()[0]
    methods_tag = getMethodsTag(full_text_ob.get_content(), article)
    if methods_tag is None:
        print((article.pmid, article.title, article.journal))
        return

    # Collapse whitespace runs so the sentence tokenizer sees clean text.
    methods_text = re.sub(r'\s+', ' ', methods_tag.text)

    labels_found = set()
    for sentence in nltk.sent_tokenize(methods_text):
        # The "not corrected" pattern takes precedence within a sentence.
        if jxn_not_re.findall(sentence):
            labels_found.add('Not corrected')
        elif jxn_re.findall(sentence):
            labels_found.add('Corrected')

    for label in ('Corrected', 'Not corrected'):
        if label in labels_found:
            metadata_ob = m.MetaData.objects.get_or_create(
                name='JxnPotential', value=label)[0]
            update_amd_obj(article, metadata_ob)

    # Record that a methods section was located for this full text.
    stat_ob = m.ArticleFullTextStat.objects.get_or_create(
        article_full_text=full_text_ob)[0]
    stat_ob.methods_tag_found = True
    stat_ob.save()
def assign_rec_temp(article):
    """Attach 'RecTemp' metadata mined from the article's methods text.

    Each sentence of the methods section is inspected:

    * if it matches ``celsius_re``, the (up to) 20 characters preceding the
      last occurrence of u'°C', plus the degree sign itself, are handed to
      ``resolveDataFloat``; the result is kept when it contains a 'value';
    * otherwise, if it matches ``room_temp_re``, a fixed entry of
      value 22.0 with range 20.0-24.0 is kept.

    The collected entries are reduced by ``validate_temp_list``.  When that
    yields a result, a matching ``ContValue`` row is reused (or created) and
    linked to the article as 'RecTemp' metadata via ``update_amd_obj``.

    Nothing is done when the article has no full text.  When no methods
    tag can be located, the article's pmid, title and journal are printed
    and no metadata is written.
    """
    full_texts = article.articlefulltext_set.all()
    # Guard against articles without full text, as the sibling assign_*
    # functions do, instead of failing with IndexError on [0].
    if full_texts.count() == 0:
        return

    full_text_ob = full_texts[0]
    methods_tag = getMethodsTag(full_text_ob.get_content(), article)
    if methods_tag is None:
        print((article.pmid, article.title, article.journal))
        return

    text = re.sub(r'\s+', ' ', methods_tag.text)
    temp_dict_list = []
    for sentence in nltk.sent_tokenize(text):
        if celsius_re.findall(sentence):
            degree_ind = sentence.rfind(u'°C')
            # Window: up to 20 characters before the degree sign, plus the
            # sign itself.  If u'°C' is absent (rfind == -1) this is ''.
            window_start = max(0, degree_ind - 20)
            window_end = min(len(sentence), degree_ind + 1)
            ret_dict = resolveDataFloat(sentence[window_start:window_end])
            if 'value' in ret_dict:
                temp_dict_list.append(ret_dict)
        elif room_temp_re.findall(sentence):
            temp_dict_list.append(
                {'value': 22.0, 'maxRange': 24.0, 'minRange': 20.0})

    if temp_dict_list:
        temp_dict_fin = validate_temp_list(temp_dict_list)
        if temp_dict_fin:
            lookup = {
                'mean': temp_dict_fin['value'],
                'min_range': temp_dict_fin.get('minRange'),
                'max_range': temp_dict_fin.get('maxRange'),
                'stderr': temp_dict_fin.get('error'),
            }
            # Reuse the first existing row if any.  Slicing with [:1] never
            # raises on an empty result, unlike the previous filter(...)[0],
            # which made the creation fallback unreachable.
            existing = list(m.ContValue.objects.filter(**lookup)[:1])
            if existing:
                cont_value_ob = existing[0]
            else:
                cont_value_ob = m.ContValue.objects.get_or_create(**lookup)[0]
            metadata_ob = m.MetaData.objects.get_or_create(
                name='RecTemp', cont_value=cont_value_ob)[0]
            update_amd_obj(article, metadata_ob)

    # Record that a methods section was located for this full text.
    aft_stat_ob = m.ArticleFullTextStat.objects.get_or_create(
        article_full_text=full_text_ob)[0]
    aft_stat_ob.methods_tag_found = True
    aft_stat_ob.save()
def assign_prep_type(article):
    """Attach 'PrepType' metadata from the methods text, else from MeSH terms.

    When the article has a full text with a methods section, every sentence
    is tested against ``culture_re``, ``in_vitro_re``, ``in_vivo_re`` and
    ``model_re``.  The labels 'cell culture', 'in vitro' and 'in vivo' are
    attached to the article when found; 'model' is collected but currently
    not written from text.

    If the text produced no metadata, the article's terms are consulted
    instead: ``culture_mesh`` maps to 'cell culture' and ``in_silico_mesh``
    maps to 'model'.
    """
    metadata_added = False

    if article.articlefulltext_set.all().count() > 0:
        full_text_ob = article.articlefulltext_set.all()[0]
        methods_tag = getMethodsTag(full_text_ob.get_content(), article)
        if methods_tag is None:
            print((article.pmid, article.title, article.journal))
        else:
            methods_text = re.sub(r'\s+', ' ', methods_tag.text)

            # A sentence may contribute several labels; the patterns are
            # tested independently of each other.
            pattern_labels = (
                (culture_re, 'cell culture'),
                (in_vitro_re, 'in vitro'),
                (in_vivo_re, 'in vivo'),
                (model_re, 'model'),
            )
            labels_found = set()
            for sentence in nltk.sent_tokenize(methods_text):
                for pattern, label in pattern_labels:
                    if pattern.findall(sentence):
                        labels_found.add(label)

            # 'model' is intentionally not written here (disabled upstream).
            for label in ('cell culture', 'in vitro', 'in vivo'):
                if label in labels_found:
                    metadata_ob = m.MetaData.objects.get_or_create(
                        name='PrepType', value=label)[0]
                    update_amd_obj(article, metadata_ob)
                    metadata_added = True

            # Record that a methods section was located for this full text.
            stat_ob = m.ArticleFullTextStat.objects.get_or_create(
                article_full_text=full_text_ob)[0]
            stat_ob.methods_tag_found = True
            stat_ob.save()

    if not metadata_added:
        # Fall back to the article's terms when the text gave nothing.
        mesh_terms = article.terms.all()
        if culture_mesh in mesh_terms:
            metadata_ob = m.MetaData.objects.get_or_create(
                name='PrepType', value='cell culture')[0]
            update_amd_obj(article, metadata_ob)
        if in_silico_mesh in mesh_terms:
            metadata_ob = m.MetaData.objects.get_or_create(
                name='PrepType', value='model')[0]
            update_amd_obj(article, metadata_ob)
def assign_electrode_type(article):
    """Attach 'ElectrodeType' metadata from the methods text, else from MeSH.

    When the article has a full text with a methods section, every sentence
    is tested against ``whole_re`` (label 'Patch-clamp') and ``sharp_re``
    (label 'Sharp'); each label found is attached to the article.

    If the text produced no metadata and ``patch_mesh`` is among the
    article's terms, 'Patch-clamp' is attached instead.
    """
    metadata_added = False

    if article.articlefulltext_set.all().count() > 0:
        full_text_ob = article.articlefulltext_set.all()[0]
        methods_tag = getMethodsTag(full_text_ob.get_content(), article)
        if methods_tag is None:
            print((article.pmid, article.title, article.journal))
        else:
            methods_text = re.sub(r'\s+', ' ', methods_tag.text)

            # Both patterns are tested on every sentence, independently.
            pattern_labels = (
                (whole_re, 'Patch-clamp'),
                (sharp_re, 'Sharp'),
            )
            labels_found = set()
            for sentence in nltk.sent_tokenize(methods_text):
                for pattern, label in pattern_labels:
                    if pattern.findall(sentence):
                        labels_found.add(label)

            for label in ('Patch-clamp', 'Sharp'):
                if label in labels_found:
                    metadata_ob = m.MetaData.objects.get_or_create(
                        name='ElectrodeType', value=label)[0]
                    update_amd_obj(article, metadata_ob)
                    metadata_added = True

            # Record that a methods section was located for this full text.
            stat_ob = m.ArticleFullTextStat.objects.get_or_create(
                article_full_text=full_text_ob)[0]
            stat_ob.methods_tag_found = True
            stat_ob.save()

    if not metadata_added:
        # Fall back to the article's terms when the text gave nothing.
        if patch_mesh in article.terms.all():
            metadata_ob = m.MetaData.objects.get_or_create(
                name='ElectrodeType', value='Patch-clamp')[0]
            update_amd_obj(article, metadata_ob)
def assign_solution_concs(article):
    """Mine recording-solution descriptions from an article's methods text.

    Sentences of the methods section are scored against ``conc_re`` (+3),
    ``mgca_re`` (+2), ``na_re`` (+1), ``k_re`` (+1) and ``cl_re`` (+2); a
    sentence scoring at least 7 is treated as a solution description.  Each
    such sentence, together with its context (the result of
    ``get_preceeding_text`` plus the following sentence), is then classified:

    * ``pipette_re`` match  -> recorded as "internal_<n>" unless the same
      text also matches ``other_re``;
    * ``record_re`` match   -> recorded as "external_<n>" (skipped if the
      text also matches ``other_re`` and an external solution was already
      recorded);
    * ``cutstore_re`` match -> kept aside as a storage solution;
    * no match anywhere     -> counted as unassigned.

    If no external solution was recorded, the last storage solution (if any)
    is recorded as external.  Finally a 'FlagSoln' metadata value is attached:
    0 for exactly one external and one internal solution with nothing
    unassigned; 1 for one external, more than one internal, nothing
    unassigned; 2 for one external, more than one internal, some unassigned;
    3 otherwise.

    Returns:
        -1 if the article has no full text, -2 if no methods tag was found,
        -3 if the methods text has 100 characters or fewer, 1 otherwise.
    """
    full_text_list = m.ArticleFullText.objects.filter(article=article.pk)
    if not full_text_list:
        return -1

    full_text = full_text_list[0].get_content()
    methods_tag = getMethodsTag(full_text, article)
    if methods_tag is None:
        print("No methods tag found article id: %s, pmid: %s" % (article.pk, article.pmid))
        return -2

    article_text = re.sub(r'\s+', ' ', methods_tag.text)
    if len(article_text) <= 100:
        print("Methods section is too small. Article id: %s, pmid: %s" % (article.pk, article.pmid))
        return -3

    # NOTE(review): a stray `return 1` used to follow the size check above.
    # It was either unreachable or turned everything below into dead code;
    # it has been removed so the mining below actually runs.  Confirm this
    # was not a deliberate switch-off before deploying.

    sentences = nltk.sent_tokenize(article_text)
    list_of_solns = []
    wrap_soln_text = []

    # Consider a machine learning approach to get the weights, and assign a
    # higher score when compounds are in close proximity, to avoid e.g.:
    # "The calcium-free saline solution containing cobalt was composed of (in mM): 115 NaCl, 23 NaHCO3, 3.1 KCl, 1.15 CoCl2, 1.2 MgCl2, and 6 glucose."
    # "The extracellular solution to isolate calcium current utilizing Ba2+ as a charge carrier contained (mm): tetraethylammonium chloride 120, BaCl2 10, MgCl2 1, Hepes 10, and glucose 10, pH adjusted to 7.3 with Tris."
    weighted_patterns = (
        (conc_re, 3),
        (mgca_re, 2),
        (na_re, 1),
        (k_re, 1),
        (cl_re, 2),
    )
    last_index = len(sentences) - 1
    for i, sentence in enumerate(sentences):
        match_score = sum(
            weight for pattern, weight in weighted_patterns
            if pattern.search(sentence))
        if match_score >= 7:
            list_of_solns.append(sentence)
            # Context = preceding text plus the next sentence ("" at the end).
            current_text_wrap = get_preceeding_text(sentences, i)
            current_text_wrap.append(sentences[i + 1] if i < last_index else "")
            wrap_soln_text.append(current_text_wrap)

    recording_solution_absent = True
    storage_solns = []
    unassigned_solns = []
    internalID = 0
    externalID = 0

    for soln, context in zip(list_of_solns, wrap_soln_text):
        # Look for an identifying keyword in the solution sentence itself
        # first, then in each piece of its surrounding context, in order.
        candidates = [soln]
        candidates.extend(context)
        for soln_id_text in candidates:
            if pipette_re.search(soln_id_text):
                if not other_re.search(soln_id_text):
                    record_compounds(article, soln, context, "internal_%s" % internalID)
                    internalID += 1
                break
            elif record_re.search(soln_id_text):
                if other_re.search(soln_id_text) and not recording_solution_absent:
                    break
                recording_solution_absent = False
                record_compounds(article, soln, context, "external_%s" % externalID)
                externalID += 1
                break
            elif cutstore_re.search(soln_id_text):
                storage_solns.append([soln, context])
                break
        else:
            # No candidate text identified the solution type.
            unassigned_solns.append([soln, context])

    if recording_solution_absent and storage_solns:
        # Fall back to the last storage solution as the external solution.
        # NOTE(review): externalID is not incremented here, so this fallback
        # does not count toward the flag computed below - confirm intent.
        recording_solution_absent = False
        soln = storage_solns.pop()
        record_compounds(article, soln[0], soln[1], "external_%s" % externalID)

    # Disabled upstream: recording unassigned solutions as "unassigned_<n>"
    # when no recording solution was found.

    no_unassigned = len(unassigned_solns) == 0
    flag_soln = 3
    if externalID == 1 and internalID == 1 and no_unassigned:
        flag_soln = 0
    elif externalID == 1 and internalID > 1 and no_unassigned:
        flag_soln = 1
    elif externalID == 1 and internalID > 1 and not no_unassigned:
        flag_soln = 2

    flag_soln_ob = m.ContValue.objects.get_or_create(
        mean=flag_soln, stderr=0, stdev=0)[0]
    flag_soln_meta_ob = m.MetaData.objects.get_or_create(
        name="FlagSoln", cont_value=flag_soln_ob)[0]
    update_amd_obj(article, flag_soln_meta_ob)
    return 1
def _resolve_age_near_last_marker(sentence, marker_pattern):
    """Parse a number from the text around the last marker in a sentence.

    Takes the 15 characters either side of the start of the last match of
    ``marker_pattern``, strips all ASCII letters, and passes the remainder to
    ``resolveDataFloat``.  Returns that result, or None when the marker does
    not occur in the sentence.
    """
    starts = [match.start(0) for match in re.finditer(marker_pattern, sentence)]
    if not starts:
        return None
    anchor = starts[-1]
    window = sentence[max(0, anchor - 15):min(len(sentence), anchor + 15)]
    # Drop letters so only digits, signs and punctuation reach the parser.
    letter_table = dict((ord(c), u'') for c in string.ascii_letters)
    return resolveDataFloat(window.translate(letter_table).strip())


def assign_animal_age(article):
    """Attach 'AnimalAge' metadata mined from the article's methods text.

    For each sentence of the methods section: if it matches ``p_age_re`` the
    text around the last "P<digit>" marker is parsed; otherwise, if it
    matches ``day_re``, the text around the last " day" marker is parsed.
    Parsed results containing a 'value' are reduced by ``validate_age_list``
    and, when that yields a result, stored as a ``ContValue`` linked to the
    article as 'AnimalAge' metadata via ``update_amd_obj``.

    Nothing is done when the article has no full text.  When no methods
    tag can be located, the article's pmid, title and journal are printed
    and no metadata is written.
    """
    full_texts = article.articlefulltext_set.all()
    # Guard against articles without full text, as the sibling assign_*
    # functions do, instead of failing with IndexError on [0].
    if full_texts.count() == 0:
        return

    full_text_ob = full_texts[0]
    methods_tag = getMethodsTag(full_text_ob.get_content(), article)
    if methods_tag is None:
        print((article.pmid, article.title, article.journal))
        return

    text = re.sub(r'\s+', ' ', methods_tag.text)
    age_dict_list = []
    for sentence in nltk.sent_tokenize(text):
        # Escaped u'' literals (rather than ur'') keep the same pattern text
        # while remaining valid syntax on both Python 2 and Python 3.
        if p_age_re.findall(sentence):
            ret_dict = _resolve_age_near_last_marker(sentence, u'P\\d')
        elif day_re.findall(sentence):
            ret_dict = _resolve_age_near_last_marker(sentence, u'\\sday')
        else:
            continue
        if ret_dict is not None and 'value' in ret_dict:
            age_dict_list.append(ret_dict)

    if age_dict_list:
        age_dict_fin = validate_age_list(age_dict_list)
        if age_dict_fin:
            cont_value_ob = m.ContValue.objects.get_or_create(
                mean=age_dict_fin['value'],
                min_range=age_dict_fin.get('minRange'),
                max_range=age_dict_fin.get('maxRange'),
                stderr=age_dict_fin.get('error'))[0]
            metadata_ob = m.MetaData.objects.get_or_create(
                name='AnimalAge', cont_value=cont_value_ob)[0]
            update_amd_obj(article, metadata_ob)

    # Record that a methods section was located for this full text.
    aft_stat_ob = m.ArticleFullTextStat.objects.get_or_create(
        article_full_text=full_text_ob)[0]
    aft_stat_ob.methods_tag_found = True
    aft_stat_ob.save()