def count_automated_database_statistics():
    """Counts validated, data-table-derived (automated) database content.

    Returns:
        dict with counts of neurons, validated NeuronEphysDataMaps, articles
        with automated content, validated EphysConceptMaps (total and
        robot-added), plus neuron-mention stats from
        count_matching_neuron_mentions().
    """
    # NEDMs whose neuron and ephys concept maps are both validated and that
    # come from a data table (i.e. were mined, not user-submitted).
    nedmsValid = m.NeuronEphysDataMap.objects.filter(
        neuron_concept_map__times_validated__gte = 1,
        ephys_concept_map__times_validated__gte = 1,
        neuron_concept_map__source__data_table__isnull = False).distinct()
    articles_automated = m.Article.objects.filter(
        datatable__datasource__neuronconceptmap__times_validated__gte = 1).distinct()
    robot_user = m.get_robot_user()
    neurons = m.Neuron.objects.filter(
        neuronconceptmap__neuronephysdatamap__in = nedmsValid).distinct()
    # NOTE: an unused `ecmsNotValid` queryset (times_validated = 0) was removed here.
    ecms_valid_total = m.EphysConceptMap.objects.filter(
        times_validated = 1, source__data_table__isnull = False).distinct()
    ecms_valid_robot = m.EphysConceptMap.objects.filter(
        times_validated = 1, added_by = robot_user,
        source__data_table__isnull = False).distinct()
    ncms_robot_id, ncms_robot_3_id, ncms_datatable_total = count_matching_neuron_mentions()
    stat_dict = {}
    stat_dict['num_neurons'] = neurons.count()
    stat_dict['num_nemds_valid'] = nedmsValid.count()
    stat_dict['num_articles'] = articles_automated.count()
    stat_dict['num_ecms_valid_total'] = ecms_valid_total.count()
    stat_dict['num_ecms_valid_robot'] = ecms_valid_robot.count()
    stat_dict['ncms_datatable_total'] = ncms_datatable_total
    stat_dict['ncms_robot_id'] = ncms_robot_id
    stat_dict['ncms_robot_3_id'] = ncms_robot_3_id
    return stat_dict
def count_database_statistics():
    """Computes summary counts over all validated database content.

    Covers both automated (data-table) and user-submitted sources and returns
    a dict of counts for neurons, journals, ephys properties, validated maps,
    articles, and neuron-mention statistics.
    """
    robot_user = m.get_robot_user()

    # NEDMs whose neuron and ephys concept maps have both been validated.
    valid_nedms = m.NeuronEphysDataMap.objects.filter(
        neuron_concept_map__times_validated__gte = 1,
        ephys_concept_map__times_validated__gte = 1).distinct()
    # Same, restricted to user-submitted sources.
    valid_user_nedms = m.NeuronEphysDataMap.objects.filter(
        neuron_concept_map__times_validated__gte = 1,
        ephys_concept_map__times_validated__gte = 1,
        neuron_concept_map__source__user_submission__isnull = False).distinct()

    validated_articles = m.Article.objects.filter(
        Q(datatable__datasource__neuronconceptmap__times_validated__gte = 1) |
        Q(usersubmission__datasource__neuronconceptmap__times_validated__gte = 1)).distinct()
    user_submitted_articles = m.Article.objects.filter(
        usersubmission__datasource__neuronconceptmap__times_validated__gte = 1).distinct()
    covered_journals = m.Journal.objects.filter(article__in = validated_articles).distinct()

    mapped_neurons = m.Neuron.objects.filter(
        neuronconceptmap__neuronephysdatamap__in = valid_nedms).distinct()
    mapped_ephys_props = m.EphysProp.objects.filter(
        ephysconceptmap__neuronephysdatamap__in = valid_nedms).distinct()

    # Articles carrying at least 4 not-yet-validated ephys concept maps.
    unvalidated_ecms = m.EphysConceptMap.objects.filter(times_validated = 0).distinct()
    unvalidated_articles = m.Article.objects.filter(
        datatable__datasource__ephysconceptmap__in = unvalidated_ecms).annotate(
        ecm_count = Count('datatable__datasource__ephysconceptmap')).filter(
        ecm_count__gte = 4).distinct()

    validated_ecms = m.EphysConceptMap.objects.filter(times_validated = 1).distinct()
    robot_validated_ecms = m.EphysConceptMap.objects.filter(
        times_validated = 1, added_by = robot_user).distinct()

    ncms_robot_id, ncms_robot_3_id, ncms_datatable_total = count_matching_neuron_mentions()

    return {
        'num_neurons': mapped_neurons.count(),
        'num_journals': covered_journals.count(),
        'num_ephys_props': mapped_ephys_props.count(),
        'num_nemds_valid': valid_nedms.count(),
        'num_nemds_valid_user': valid_user_nedms.count(),
        'num_articles': validated_articles.count(),
        'num_articles_user_submit': user_submitted_articles.count(),
        'num_articles_unvalid': unvalidated_articles.count(),
        'num_ecms_valid_total': validated_ecms.count(),
        'num_ecms_valid_robot': robot_validated_ecms.count(),
        'ncms_datatable_total': ncms_datatable_total,
        'ncms_robot_id': ncms_robot_id,
        'ncms_robot_3_id': ncms_robot_3_id,
    }
def assocArticleNeuron(artOb):
    """Mines an article's full text for neuron mentions and records a
    NeuronArticleMap for each neuron mentioned more than twice, then marks
    the full text as processed for neuron-article mapping."""
    robot_user = m.get_robot_user()
    full_text_ob = artOb.articlefulltext_set.all()[0]
    html = full_text_ob.get_content()
    if html == 'test':  # placeholder content -- nothing to mine
        return
    plain_text = bs(''.join(html)).get_text()
    seen_neurons = []
    for mention in findNeuronsInText(plain_text):
        neuron_ob = mention[0]
        mention_count = mention[2]
        # record each neuron at most once, and only when mentioned > 2 times
        if neuron_ob in seen_neurons or mention_count <= 2:
            continue
        m.NeuronArticleMap.objects.get_or_create(neuron = neuron_ob,
                                                 num_mentions = mention_count,
                                                 article = artOb,
                                                 added_by = robot_user)
        seen_neurons.append(neuron_ob)
    stat_ob = m.ArticleFullTextStat.objects.get_or_create(article_full_text = full_text_ob)[0]
    stat_ob.neuron_article_map_processed = True
    stat_ob.save()
def update_ecm_using_text_mining(ecm, ephys_synonym_list=None, verbose_output=True): """Updates an EphysConceptMap object using text mining rules Args: ecm: an EphysConceptMap object for the object to be updated ephys_synonym_list: the list of strings representing ephys synonyms verbose_output: a bool indicating whether function should print statements """ if not ephys_synonym_list: ephysSyns = m.EphysPropSyn.objects.all() ephys_synonym_list = [e.term.lower() for e in ephysSyns] if not ecm.ref_text: # some light error checking to make sure there's some text for the ecm object return # get the closest matching ephys prop given the table header reference text matched_ephys_prop = match_ephys_header(ecm.ref_text, ephys_synonym_list) if matched_ephys_prop is None: # no ephys props matched if verbose_output: print 'deleting %s, prop: %s' % (ecm.ref_text, ecm.ephys_prop) ecm.delete() # remove the EphysConceptMap since none of the updated EphysProps matched it elif matched_ephys_prop != ecm.ephys_prop: # different found prop than existing one if verbose_output: print 'changing %s, to prop: %s, from prop: %s' %(ecm.ref_text, matched_ephys_prop, ecm.ephys_prop) ecm.ephys_prop = matched_ephys_prop # update the ecm ecm.changed_by = m.get_robot_user() ecm.save()
def annotate_misnormalized_nedm(nedm): ''' if can't algorithmically normalize nedm value to something appropriate, and raw value is out of range, leave a note in corresponding ecm in table''' norm_dict = normalize_nedm_val(nedm) if norm_dict['value'] is None and check_data_val_range(nedm.val, nedm.ephys_concept_map.ephys_prop) is False: ecm = nedm.ephys_concept_map normalizing_failed_note = 'Parsing failed to normalize ephys data' if not ecm.note: ecm.note = normalizing_failed_note ecm.changed_by = m.get_robot_user() ecm.save() print 'adding failed normalizing note to %s with data table id %d' % (ecm.ephys_prop, ecm.source.data_table.pk)
def assign_old_article_metadata_maps(): with open ('data/old_article_metadata_maps.txt', 'r') as f: content = f.readlines() num_amdms = len(content) print 'repopulating %d article metadata maps' % num_amdms robot_user = m.get_robot_user() for i,line in enumerate(content): prog(i, num_amdms) [art_pk_str, md_pk_str] = re.findall('\d+', line) # print (art_pk_str, md_pk_str) # print line a = m.Article.objects.get(pk = int(art_pk_str)) md = m.MetaData.objects.get(pk = int(md_pk_str)) amdm = m.ArticleMetaDataMap.objects.get_or_create(article = a, metadata = md, added_by = robot_user)[0]
def update_data_table_stat(data_table_object):
    """adds intermediate fields to data table stat object based on concept map
    objects associated with data table

    Updates curating users (excluding the robot), last-curated-on timestamp,
    unique ecm/ncm/nedm counts, and times_validated (min over the table's
    neuron concept maps). Returns the saved DataTableStat, or None when the
    table has no concept maps.
    """
    data_table_stat = m.DataTableStat.objects.get_or_create(data_table = data_table_object)[0]
    # assign curating users by looking at history concepts assoc with table
    robot_user = m.get_robot_user()
    user_list = data_table_object.get_curating_users()
    if robot_user in user_list:
        user_list.remove(robot_user)
    existing_users = data_table_stat.curating_users.all()
    for u in user_list:
        if u in existing_users:
            continue
        data_table_stat.curating_users.add(u)
    # assign last curated on by looking at curating users curation times and
    # getting most recent
    concept_maps = data_table_object.get_concept_maps()
    if len(concept_maps) == 0:
        # NOTE: no concept maps means the remaining stats are never saved
        return
    curated_on_dates = []
    for cm in concept_maps:
        curated_on = cm.history.latest().history_date
        curated_on_dates.append(curated_on)
    curated_on = max(curated_on_dates)
    # update last curated on if different
    # bug fix: was `is not` (object identity), which is almost always True for
    # datetimes -- use value equality instead
    if data_table_stat.last_curated_on != curated_on:
        data_table_stat.last_curated_on = curated_on
    # count number of unique ncms, ecms, nedms associated with table
    data_table_stat.num_ecms = m.EphysProp.objects.filter(
        ephysconceptmap__source__data_table = data_table_object).distinct().count()
    data_table_stat.num_ncms = m.Neuron.objects.filter(
        neuronconceptmap__source__data_table = data_table_object).distinct().count()
    data_table_stat.num_nedms = m.NeuronEphysDataMap.objects.filter(
        source__data_table = data_table_object).distinct().count()
    # define times validated here as min num of times validated per neuron
    # concept map
    concept_maps = data_table_object.get_neuron_concept_maps()
    times_validated_per_neuron = []
    for cm in concept_maps:
        times_validated_per_neuron.append(cm.times_validated)
    if len(times_validated_per_neuron) > 0:
        data_table_stat.times_validated = int(min(times_validated_per_neuron))
    data_table_stat.save()
    return data_table_stat
def count_metadata_assign_accuracy(): articles = m.Article.objects.filter(datatable__datasource__neuronconceptmap__times_validated__gte = 1, articlefulltext__articlefulltextstat__methods_tag_found = True) robot_user = m.get_robot_user() metadata_keys = ['Species', 'Strain', 'ElectrodeType', 'PrepType', 'JxnPotential', 'RecTemp', 'AnimalAge'] stat_dict = {} for metadata_key in metadata_keys: temp_dict = {} values_all = m.ArticleMetaDataMap.objects.filter(metadata__name = metadata_key,article__in=articles).distinct() values_robot = m.ArticleMetaDataMap.objects.filter(metadata__name = metadata_key, article__in = articles, added_by = robot_user).distinct() temp_dict['values_all'] = values_all.count() temp_dict['values_robot'] = values_robot.count() print metadata_key print temp_dict stat_dict[metadata_key] = temp_dict return stat_dict
def update_concept_maps(): ncm_fields, ecm_fields, nedm_fields = load() datatables = m.DataTable.objects.all() print 'Getting or creating data sources' for i,x in enumerate(datatables): prog(i,datatables.count()) m.DataSource.objects.get_or_create(data_table=x) anon_user = m.get_anon_user() robot_user = m.get_robot_user() print 'Updating nedm fields' for i,nedm_field in enumerate(nedm_fields): prog(i, len(nedm_fields)) nedm=m.NeuronEphysDataMap.objects.get(pk=nedm_field['pk']) data_source = m.DataSource.objects.get(data_table=nedm_field['fields']['data_table']) nedm.source = data_source # if nedm.added_by_old == 'human': # nedm.added_by = anon_user # else: # nedm.added_by = robot_user nedm.save() print 'Updating ncm fields' for i,ncm_field in enumerate(ncm_fields): prog(i, len(ncm_fields)) ncm=m.NeuronConceptMap.objects.get(pk=ncm_field['pk']) data_source = m.DataSource.objects.get(data_table=ncm_field['fields']['data_table']) ncm.source = data_source # if ncm.added_by_old == 'human': # ncm.added_by = anon_user # else: # ncm.added_by = robot_user ncm.save() print 'Updating ecm fields' for ecm_field in ecm_fields: prog(i, len(ecm_fields)) ecm=m.EphysConceptMap.objects.get(pk=ecm_field['pk']) data_source = m.DataSource.objects.get(data_table=ecm_field['fields']['data_table']) ecm.source = data_source # if ecm.added_by_old == 'human': # ecm.added_by = anon_user # else: # ecm.added_by = robot_user ecm.save()
def assocDataTableEphysVal(dataTableOb):
    """Associates a data table object with ephys concept map objects """
    table = dataTableOb
    source = m.DataSource.objects.get(data_table = table)
    robot_user = m.get_robot_user()
    if table.table_text is None:
        return
    synonym_terms = [syn.term.lower() for syn in m.EphysPropSyn.objects.all()]
    soup = BeautifulSoup(''.join(table.table_html), 'lxml')
    # header cells first, then data cells
    for cell in soup.findAll('th') + soup.findAll('td'):
        raw_text = cell.get_text()
        stripped = raw_text.strip()
        cell_id = str(cell['id']) if 'id' in cell.attrs.keys() else -1
        if len(stripped) == 0:
            continue
        if has_ascii_letters(stripped) is not True:
            continue
        # SJT Note - Currently doesn't mine terms in synapse stop words list
        matched_prop = match_ephys_header(stripped, synonym_terms)
        unit = get_units_from_table_header(stripped)
        if not matched_prop:
            continue
        # cap stored reference text at 199 chars
        ref_text = raw_text[0:min(len(raw_text), 199)]
        # create EphysConceptMap object
        m.EphysConceptMap.objects.get_or_create(ref_text = ref_text,
                                                ephys_prop = matched_prop,
                                                source = source,
                                                dt_id = cell_id,
                                                changed_by = robot_user,
                                                times_validated = 0,
                                                identified_unit = unit)
def update_other_defined_ecms(): """Updates ephys prop assigned to previously defined ecm's tagged as 'other'""" other_ephys_prop = m.EphysProp.objects.get(name = 'other') ecm_list = m.EphysConceptMap.objects.filter(ephys_prop = other_ephys_prop) ephysSyns = m.EphysPropSyn.objects.all() ephysSynList = [e.term.lower() for e in ephysSyns] for ecm in ecm_list: # get the closest matching ephys prop given the table header reference text matched_ephys_prop = article_text_mining.mine_ephys_prop_in_table.match_ephys_header(ecm.ref_text, ephysSynList) if matched_ephys_prop is None: # no ephys props matched continue if matched_ephys_prop != ecm.ephys_prop: # different found prop than existing one print 'changing %s, to prop: %s, from prop: %s' %(ecm.ref_text, matched_ephys_prop, ecm.ephys_prop) ecm.ephys_prop = matched_ephys_prop # update the ecm ecm.changed_by = m.get_robot_user() ecm.save()
def record_solution_concs():
    """For every validated article that has ephys data, records compound
    concentrations parsed from its internal/external solution metadata text."""
    articles = m.Article.objects.filter(
        Q(datatable__datasource__neuronconceptmap__times_validated__gte = 1,
          datatable__datasource__neuronephysdatamap__isnull = False) |
        Q(usersubmission__datasource__neuronconceptmap__times_validated__gte = 1,
          usersubmission__datasource__neuronephysdatamap__isnull = False)).distinct()
    robot_user = m.get_robot_user()
    solution_names = {"external": 'ExternalSolution', "internal": 'InternalSolution'}
    article_count = articles.count()
    for idx, article in enumerate(articles):
        prog(idx, article_count)
        for soln, soln_name in solution_names.iteritems():
            matches = m.ArticleMetaDataMap.objects.filter(article = article, metadata__name = soln_name)
            # need at least one metadata map with attached reference text
            if not matches or not matches[0].ref_text:
                continue
            record_compounds(article, None, matches[0].ref_text.text,
                             ["", "", "", ""], "%s_0" % soln, robot_user)
def assign_robot():
    """Stamps every NeuronArticleMap as added by the robot user."""
    robot = m.get_robot_user()
    for mapping in m.NeuronArticleMap.objects.all():
        mapping.added_by = robot
        mapping.save()
tot_count = artObs.count() #numRes = 23411881#res.count() print '%d num total articles' % tot_count blockSize = 100 firstInd = 0 lastInd = blockSize blockCnt = 0 while firstInd < lastInd: print '%d of %d blocks ' % (blockCnt, tot_count/blockSize) for artOb in artObs[firstInd:lastInd].iterator(): assocArticleNeuron(artOb) firstInd = lastInd + 1 lastInd = min(lastInd+blockSize, tot_count) blockCnt += 1 robot_user = m.get_robot_user() def assocArticleNeuron(artOb): fullTextOb = artOb.articlefulltext_set.all()[0] fullTextHtml = fullTextOb.get_content() if fullTextHtml == 'test': return soup = bs(''.join(fullTextHtml)) full_text = soup.get_text() neuronTuple = findNeuronsInText(full_text) usedNeurons = [] for t in neuronTuple: neuronOb = t[0] numMentions = t[2] if neuronOb not in usedNeurons and numMentions > 2: #neuronSynOb = t[1] neuronArticleMapOb = m.NeuronArticleMap.objects.get_or_create(neuron = neuronOb,
def assocDataTableEphysVal(dataTableOb):
    """Associates a data table's th/td cells with EphysConceptMap objects via
    exact-then-fuzzy matching of header text against ephys synonyms.

    NOTE(review): relies on module-level globals not visible here
    (`ephysSynList`, `matchThresh`, `shortLim`, `matchThreshShort`) and on
    helpers `isHeader`/`resolveHeader`/`process.extractOne` -- presumably a
    fuzzywuzzy-style matcher; confirm against their definitions.
    """
    dt = dataTableOb
    ds = m.DataSource.objects.get(data_table = dt)
    robot_user = m.get_robot_user()
    if dt.table_text is None:
        return
    tableTag = dt.table_html
    soup = BeautifulSoup(''.join(tableTag))
    headerTags = soup.findAll('th')
    #print headerTags
    tdTags = soup.findAll('td')
    allTags = headerTags + tdTags
    for tag in allTags:
        origTagText = tag.get_text()
        tagText = origTagText.strip()
        # keep the cell's html id (as a string) when present; -1 otherwise
        if 'id' in tag.attrs.keys():
            tag_id = str(tag['id'])
        else:
            tag_id = -1
        if len(tagText) == 0:
            continue
        if isHeader(tagText) is True:
            normHeader = resolveHeader(tagText)
            if len(normHeader) == 0:
                continue
            elif normHeader in ephysSynList:
                # try to match exactly
                bestMatch = normHeader
                matchVal = 100
            else:
                #try to fuzzy match
                try:
                    processOut = process.extractOne(normHeader, ephysSynList)
                    if processOut is not None:
                        bestMatch, matchVal = processOut
                    else:
                        continue
                except ZeroDivisionError:
                    # extractOne can divide by zero on degenerate input; skip cell
                    continue
            if matchVal > matchThresh:
                ephysSynOb = m.EphysPropSyn.objects.get(term = bestMatch)
                ephysPropQuerySet = m.EphysProp.objects.filter(synonyms = ephysSynOb)
                if ephysPropQuerySet.count() > 0:
                    ephysPropOb = ephysPropQuerySet[0]
                else:
                    continue
                # further check that if either header or syn is really short,
                # the match needs to clear the stricter short-string threshold
                if len(normHeader) <= shortLim or len(ephysSynOb.term) <= shortLim:
                    if matchVal < matchThreshShort:
                        continue
                # create EphysConceptMap object (ref_text capped at 199 chars)
                save_ref_text = origTagText[0:min(len(origTagText),199)]
                #print save_ref_text.encode("iso-8859-15", "replace")
                #print ephysPropOb.name
                # print ephysSynOb.term
                #print matchVal
                ephysConceptMapOb = m.EphysConceptMap.objects.get_or_create(ref_text = save_ref_text,
                    ephys_prop = ephysPropOb,
                    source = ds,
                    dt_id = tag_id,
                    match_quality = matchVal,
                    added_by = robot_user,
                    times_validated = 0)[0]