Пример #1
0
    def delete (self, context):
        cursor = context.get_cursor()

        try:
            # First, remove the study (connected table records will
            # also be deleted).  But don't delete non-existent studies.
            if self.study_id >= 0:
                study = Study(context, self.study_id)
                study.delete(context)
            
            # Next, unindex it.
            search_index = canary.search.SearchIndex(context)
            search_index.unindex_record(self)

            # Then, remove the metadata
            cursor.execute("""
                DELETE FROM queued_record_metadata
                WHERE queued_record_id = %s
                """, self.uid)
            
            # Finally, remove this record itself.
            cursor.execute("""
                DELETE FROM queued_records
                WHERE uid = %s
                """, self.uid)

            if context.config.use_cache:
                context.cache_delete('record:%s' % self.uid)
        except Exception, e:
            context.logger.error('Delete queued record %s (%s)', self.uid, e)
Пример #2
0
    def delete(self, context):
        cursor = context.get_cursor()

        try:
            # First, remove the study (connected table records will
            # also be deleted).  But don't delete non-existent studies.
            if self.study_id >= 0:
                study = Study(context, self.study_id)
                study.delete(context)

            # Next, unindex it.
            search_index = canary.search.SearchIndex(context)
            search_index.unindex_record(self)

            # Then, remove the metadata
            cursor.execute(
                """
                DELETE FROM queued_record_metadata
                WHERE queued_record_id = %s
                """, self.uid)

            # Finally, remove this record itself.
            cursor.execute(
                """
                DELETE FROM queued_records
                WHERE uid = %s
                """, self.uid)

            if context.config.use_cache:
                context.cache_delete('record:%s' % self.uid)
        except Exception, e:
            context.logger.error('Delete queued record %s (%s)', self.uid, e)
Пример #3
0
 def search(self, query):
     """Run a lucene search over curated records and return their studies.

     ``query`` is handed to ``self.searcher.search`` with
     ``curated_only=True``; one Study is built per hit from the hit's
     ``study_id``, in the order the hits are returned.
     """
     hits = self.searcher.search(query, curated_only=True)
     return [Study(self.context, hit.study_id) for hit in hits]
Пример #4
0
    query_str = options.boolean.join(
        [' "%s" [%s] ' % (term, options.field) for term in args])
    #print query_str.strip()

    search_index = SearchIndex(context)
    hit_list = []
    hits, searcher = search_index.search(query_str)
    for i, doc in hits:
        hit_list.append(doc.get('uid'))
    searcher.close()

    output = []
    for id in hit_list:
        rec = QueuedRecord(context, int(id))
        if options.locations:
            study = Study(context, rec.study_id)
            for loc in study.locations:
                out = []
                out.extend((id, loc.uid, loc.study_id, loc.feature_id))
                feature = Feature(uid=loc.feature_id)
                feature.load(context)
                out.extend((feature.latitude, feature.longitude, feature.name,
                            feature.feature_type, feature.country_code))
                output.append('\t'.join([str(v) for v in out]))
        else:
            mm = rec.get_mapped_metadata(ctm)
            if mm['author']:
                first_author = mm['author'][0]
            else:
                first_author = '-'
            output.append('\t'.join(
Пример #5
0
#!/usr/bin/env python

# $Id$

from canary.context import Context
from canary.loader import QueuedRecord
from canary.study import Study

context = Context()
cursor = context.get_cursor()
cursor.execute("""
    SELECT studies.record_id, study_history.study_id, study_history.modified
    FROM study_history, studies
    WHERE study_history.message = 'Set record status to curated'
    AND study_history.study_id = studies.uid
    ORDER BY studies.record_id ASC, study_history.modified ASC
    """)

for row in cursor:
    print "record_id:%i; study_id:%i; curated:%s" % row
    # update either QUEUED_RECORDS or STUDIES with curated time
    # r = QueuedRecord(con,row[0])
    # r.date_curated = row[2]
    # r.save
    s = Study(context, row[1])
    s.date_curated = row[2]
    s.save(context)
Пример #6
0
#!/usr/bin/env python

# $Id$


from canary.context import Context
from canary.loader import QueuedRecord
from canary.study import Study

context = Context()
cursor = context.get_cursor()
cursor.execute("""
    SELECT studies.record_id, study_history.study_id, study_history.modified
    FROM study_history, studies
    WHERE study_history.message = 'Set record status to curated'
    AND study_history.study_id = studies.uid
    ORDER BY studies.record_id ASC, study_history.modified ASC
    """)

for row in cursor:
    print "record_id:%i; study_id:%i; curated:%s" % row
    # update either QUEUED_RECORDS or STUDIES with curated time
    # r = QueuedRecord(con,row[0])
    # r.date_curated = row[2]
    # r.save
    s = Study(context, row[1])
    s.date_curated = row[2]
    s.save(context)

Пример #7
0
            pmid_list.sort()

        for pmid in pmid_list:
            cursor.execute("""
                SELECT studies.uid AS study_id, queued_records.uid AS queued_record_id
                FROM queued_records, studies
                WHERE queued_records.study_id = studies.uid
                AND studies.status = 2
                AND queued_records.unique_identifier = %s
                """, pmid)
            fields = [d[0] for d in cursor.description]
            desc = dtuple.TupleDescriptor([[f] for f in fields])
            row = cursor.fetchone()
            if row:
                row = dtuple.DatabaseTuple(desc, row)
                study = Study(row['study_id'])
                study.load(cursor)
                studies.append((study, pmid))
    
    else:
        # sort by unique_identifier for easier side-by-side comparison
        cursor.execute("""
            SELECT studies.uid AS study_id, queued_records.uid AS queued_record_id,
                queued_records.unique_identifier
            FROM studies, queued_records
            WHERE studies.uid = queued_records.study_id
            AND studies.status = 2
            AND studies.curator_user_id = %s
            ORDER BY ABS(queued_records.unique_identifier)
            """, curator_id)
        
Пример #8
0
def makeplot(context, token, records):

    this_module = sys.modules[__name__]
    N = len(linkage_keys)
    ind = arange(N)
    width = 0.35

    meth_type_count = {}

    study_count = dict(zip(linkages, array([0] * len(linkages))))
    for id, type in linkages_attrs:
        meth_type_count[id] = start.copy()

    for rec in records:
        study = Study(context, rec.study_id)
        for attr, key in linkages_attrs:
            if getattr(study, 'has_%s' % key):
                study_count[attr] += 1
                for meth in study.methodologies:
                    meth_type_count[attr][meth.study_type_id] += 1

    rc('grid', color=0.85, linestyle='-', linewidth=0.3)
    grid(True)

    yoff = array([0] * N)

    s = """
            'experimental' : 1,
        'descriptive' : 2,
        'aggregate' : 3,
        'cross sectional' : 4,
        'cohort' : 5,
        'case control' : 6,
        'disease model' : 7,
        """

    print['%s %s\n' % (k, v) for k, v in meth_type_count.items()]

    p_exp_x = [meth_type_count[key][1] for key in linkage_keys]
    p_exp = bar(ind, p_exp_x, width, color='#993333', bottom=yoff)
    yoff = yoff + p_exp_x

    p_coh_x = [meth_type_count[key][5] for key in linkage_keys]
    p_coh = bar(ind, p_coh_x, width, color='#FF9933', bottom=yoff)
    yoff = yoff + p_coh_x

    p_csec_x = [meth_type_count[key][4] for key in linkage_keys]
    p_csec = bar(ind, p_csec_x, width, color='#99CC99', bottom=yoff)
    yoff = yoff + p_csec_x

    p_desc_x = [meth_type_count[key][2] for key in linkage_keys]
    p_desc = bar(ind, p_desc_x, width, color='#6666CC', bottom=yoff)
    yoff = yoff + p_desc_x

    #p_agg_x = [meth_type_count[key][3] for key in linkage_keys]
    #p_agg = bar(ind, p_agg_x, width, color='#CCCC00', bottom=yoff)
    #yoff = yoff + p_agg_x

    #p_cc_x = [meth_type_count[key][6] for key in linkage_keys]
    #p_cc = bar(ind, p_cc_x, width, color='#CC66FF', bottom=yoff)
    #yoff = yoff + p_cc_x

    #p_dm_x = [meth_type_count[key][7] for key in linkage_keys]
    #p_dm = bar(ind, p_dm_x, width, color='#993366', bottom=yoff)
    #yoff = yoff + p_dm_x

    precords_x = array([len(records)] * N)
    #precords = bar(ind+width/3, precords_x, width/3, color='#999999', bottom=0)
    precords = plot(precords_x, color='#AAAAAA', marker='-', linewidth=1.5)

    pstudies_x = [study_count[k] for k in linkage_keys]
    pstudies = bar(ind + width / 3,
                   pstudies_x,
                   width / 3,
                   color='#EEEEEE',
                   bottom=0)

    max_val = max(yoff)
    xlabel('Linkages to Human Health')
    ylabel('# Methodologies by Type for Studies with Linkages')
    title('Animal Sentinels for "%s" (%s records)' % (token, len(records)),
          size=12)
    xticks(ind + width / 2, [linkages[k] for k in linkage_keys],
           rotation=20,
           size=6)
    step = max_val / 5
    yticks(arange(0, max_val + (step * 3), step))

    legend(
        (p_exp[0], p_coh[0], p_csec[0], p_desc[0], precords[0], pstudies[0]),
        ('Experimental', 'Cohort', 'Cross-Sectional', 'Descriptive',
         '# Studies Total', '# Records w/Linkage'))
    ##    legend((p_exp[0], p_desc[0], p_agg[0], p_csec[0], p_coh[0],
    ##        p_cc[0], p_dm[0], precords[0], pstudies[0]),
    ##        ('Experimental', 'Descriptive', 'Aggregate',
    ##        'Cross-Sectional', 'Cohort', 'Case-Control', 'Disease Model',

    #savefig(('%s' % token.replace(' ', '_')))
    show()
    cla()
Пример #9
0
    def index_record(self, record, writer=None):
        # field, value, store?, index?, token?
        try:
            if not writer:
                had_writer = False
                writer = self.context.get_search_index_writer(False)
            else:
                had_writer = True

            study = Study(self.context, record.study_id)

            self.logger.debug('starting document')
            doc = PyLucene.Document()

            # First, we need to create a unique key so we can later delete
            # if necessary.  Will try simply uid for now.
            doc.add(PyLucene.Field('uid', str(record.uid), True, True, False))
            doc.add(PyLucene.Field('all', str(record.uid), True, True, False))

            # Second, save internal-use metadata.  These should probably
            # be x'd out at Query-time.
            doc.add(
                PyLucene.Field('record-status', str(record.status), False,
                               True, False))
            doc.add(
                PyLucene.Field('article-type', str(study.article_type), False,
                               True, False))

            source_catalog = self.context.get_source_catalog()
            complete_term_map = source_catalog.get_complete_mapping()
            mapped_metadata = record.get_mapped_metadata(complete_term_map)

            # First index all the non-multiple metadata fields
            for field in ('abstract', 'affiliation', 'issn', 'journal',
                          'pubdate', 'issue', 'pages', 'title', 'volume'):
                val = mapped_metadata.get(field, None)
                if val:
                    doc.add(PyLucene.Field(field, val, False, True, True))
                    doc.add(PyLucene.Field('all', val, False, True, True))

            # Be sure to index all of (abbrev, full title, issn) as "journal"
            issn = mapped_metadata.get('issn')
            if issn:
                j = Journal()
                j.load_from_issn(self.context, issn)
                no_dash = j.no_dash()
                self.logger.debug('indexing journal: %s, abbv:%s, issn:%s' % \
                    (j.journal_title, j.abbreviation, issn))
                doc.add(PyLucene.Field('journal', issn, False, True, True))
                doc.add(PyLucene.Field('journal', no_dash, False, True, True))
                doc.add(PyLucene.Field('all', issn, False, True, True))
                doc.add(PyLucene.Field('all', no_dash, False, True, True))
                if j.abbreviation:
                    doc.add(
                        PyLucene.Field('journal', j.abbreviation, False, True,
                                       True))
                    doc.add(
                        PyLucene.Field('all', j.abbreviation, False, True,
                                       True))
                if j.journal_title:
                    doc.add(
                        PyLucene.Field('journal', j.journal_title, False, True,
                                       True))
                    doc.add(
                        PyLucene.Field('all', j.journal_title, False, True,
                                       True))

            # If a page range is given, index the first page, assuming
            # the delimiter is '-'
            pages = mapped_metadata.get('pages', None)
            if pages \
                and '-' in pages:
                first_page = pages[0:pages.index('-')]
                doc.add(PyLucene.Field('pages', first_page, False, True, True))
                doc.add(PyLucene.Field('all', first_page, False, True, True))

            # 'unique_identifier' must be specially treated because
            # of the '_'
            val = mapped_metadata.get('unique_identifier', None)
            if val:
                doc.add(
                    PyLucene.Field('unique-identifier', val, False, True,
                                   True))
                doc.add(PyLucene.Field('all', val, False, True, True))

            # Next, index all the possibly-multiple metadata fields
            # Give these (especially for author and subject) a little
            # boost, less than for canary UMLS concepts
            for field in ('author', 'grantnum', 'keyword', 'registrynum',
                          'subject'):
                vals = mapped_metadata.get(field, None)
                for val in vals:
                    doc.add(PyLucene.Field(field, val, False, True, True))
                    f = PyLucene.Field('all', val, False, True, True)
                    f.setBoost(1.3)
                    doc.add(f)

            # If at least one author name is available, index the first
            # author to support first-author searching.  Also, boost it
            # slightly higher than the other authors.
            authors = mapped_metadata.get('author', None)
            if authors:
                doc.add(
                    PyLucene.Field('first-author', authors[0], False, True,
                                   True))
                f = PyLucene.Field('all', authors[0], False, True, True)
                f.setBoost(1.5)
                doc.add(f)

            # All the booleans
            for bool in ('has_outcomes', 'has_exposures', 'has_relationships',
                         'has_interspecies', 'has_exposure_linkage',
                         'has_outcome_linkage', 'has_genomic'):
                val = getattr(study, bool)
                # NOTE: I think lucene dislikes '_' in field names ??
                boolstr = bool.replace('_', '-')
                doc.add(
                    PyLucene.Field(boolstr, str(int(val)), False, True, False))
                # NOTE: no need to add this to 'all'.  I think.

            # Now, all the UMLS concepts.  Simpler approach to
            # lucene "synonym injection", but it works!  Give it
            # slightly bigger boost than keywords/subjects
            for ctype in ('exposures', 'outcomes', 'risk_factors', 'species'):
                # NOTE: I think lucene dislikes '_' in field names ??
                ctype_search = ctype.replace('_', '-')
                for val in getattr(study, ctype):
                    concept = Concept(self.context, val.concept_id)
                    for syn in concept.synonyms:
                        doc.add(
                            PyLucene.Field(ctype_search,
                                           unicode(syn, 'latin-1'), False,
                                           True, True))
                        f = PyLucene.Field('all', unicode(syn, 'latin-1'),
                                           False, True, True)
                        f.setBoost(2.0)
                        doc.add(f)

            # And, the locations
            gazeteer = self.context.get_gazeteer()
            locs = []
            for location in study.locations:
                feature = Feature(self.context, uid=location.feature_id)
                feature.load(self.context)
                if gazeteer.fips_codes.has_key(
                    (feature.country_code, feature.adm1)):
                    region_name = gazeteer.fips_codes[(feature.country_code,
                                                       feature.adm1)]
                else:
                    region_name = ''
                full_name = '%s (%s, %s, %s)' % (
                    feature.name, gazeteer.feature_codes[feature.feature_type],
                    render_capitalized(region_name),
                    render_capitalized(
                        gazeteer.country_codes[feature.country_code]))
                doc.add(
                    PyLucene.Field('location', unicode(full_name, 'latin-1'),
                                   False, True, True))
                doc.add(
                    PyLucene.Field('all', unicode(full_name, 'latin-1'), False,
                                   True, True))

            # Finally, the methodologies
            for meth in study.methodologies:
                doc.add(
                    PyLucene.Field('methodology',
                                   meth.get_study_type(text=True), False, True,
                                   True))
                doc.add(
                    PyLucene.Field('all', meth.get_study_type(text=True),
                                   False, True, True))
                # And each exposure route term
                for route in meth.get_routes(True):
                    doc.add(
                        PyLucene.Field('exposure_route', route, False, True,
                                       True))
                    doc.add(PyLucene.Field('all', route, False, True, True))

            writer.addDocument(doc)
            if not had_writer:
                writer.close()
        except Exception, e:
            self.logger.error('Failed to index record: %s', e)
            self.logger.error(traceback.print_exc())
Пример #10
0
if __name__ == '__main__':
    cmdline = CommandLine()
    cmdline.parse_args()
    context = cmdline.context()

    collector = StatCollector(context)
    collector.add_handlers(
        CuratorHandler(),
        ArticleTypeHandler(),
        ExposureHandler(),
        OutcomeHandler(),
        RiskFactorHandler(),
        SpeciesHandler(),
        LocationHandler(),
        MethodologyTypeHandler(),
    )

    searcher = RecordSearcher(context)
    records = searcher.search('record-status:%i' % QueuedRecord.STATUS_CURATED)

    today = datetime.now()
    one_weeks_records = [
        rec for rec in records
        if today - Study(context, rec.study_id).date_modified <= timedelta(7)
    ]
    collector.process(one_weeks_records)

    for handler in collector.handlers:
        print handler.__class__, handler.stats