Example #1
    def remove_percolators(self, dict_name):
        """
        Remove percolators. Good for re-adding if they're screwed up somehow.

        Args:
            dict_name (string): The dictionary name to lookup

        Returns: 0 on success, 1 if the dictionary does not exist
        """
        # Get dict_id + terms; make sure the dictionary exists first
        self.pg_cursor.execute("""
          SELECT dict_id FROM dictionaries WHERE name ILIKE %(name)s;
          """, {"name": dict_name})
        row = self.pg_cursor.fetchone()
        if row is None:
            return 1
        dict_id = row['dict_id']
        self.pg_cursor.execute("""
          SELECT term FROM dict_terms
          WHERE dict_id = %(dict_id)s;
          """, {"dict_id": dict_id})
        temp_terms = self.pg_cursor.fetchall()
        terms = [i['term'] for i in temp_terms]

        es_helper = ElasticsearchHelper.ElasticsearchHelper(self.config.get("es_index", self.corpus))

        for term in terms:
            print "Removing %s" % term
            es_helper.remove_percolator(dict_id, term)

        return 0
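
A minimal usage sketch, assuming the surrounding class is exposed as DictionaryManager (that name and the dictionary name below are hypothetical; pg_cursor, config, and corpus are assumed to be wired up by the constructor):

    # Hypothetical driver: drop every percolator registered for one dictionary
    # so they can be re-added cleanly later.
    manager = DictionaryManager()
    status = manager.remove_percolators("stratigraphic_names")
    print "remove_percolators returned %s" % status  # 0 on success, 1 if missing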
Example #2
    def remove(self, dict_name):
        """
        Remove a dictionary and all associated entities (terms matched, percolator, subsets)

        Args:
            dict_name (string): Name of dictionary to remove
        Returns:
            0 on success, 1 if the dictionary does not exist
        """
        # Get dict_id + terms; make sure the dictionary exists first
        self.pg_cursor.execute("""
          SELECT dict_id FROM dictionaries WHERE name ILIKE %(name)s;
          """, {"name": dict_name})
        check = self.pg_cursor.fetchone()
        if check is None:
            return 1
        dict_id = check['dict_id']
        self.pg_cursor.execute("""
          SELECT term FROM dict_terms
          WHERE dict_id = %(dict_id)s;
          """, {"dict_id": dict_id})
        temp_terms = self.pg_cursor.fetchall()
        terms = [i['term'] for i in temp_terms]

        es_helper = ElasticsearchHelper.ElasticsearchHelper(self.config.get("es_index", self.corpus))

        for term in terms:
            es_helper.remove_percolator(dict_id, term)
            # remove from dict_terms
            self.delete_term(dict_id, term)

        # delete dict_subset
        for nlp_type in NLP_TYPES:
            self.pg_cursor.execute("""
              DROP TABLE IF EXISTS dict_subsets.%(table_name)s;

              DELETE FROM dict_subsets.meta
              WHERE dict_id = %(dict_id)s
              AND subset_name = %(dict_name)s;

            """, {
              "table_name": AsIs(dict_name + "_" + nlp_type),
              "dict_id": dict_id,
              "dict_name": dict_name
            })

        # remove from dictionaries
        self.pg_cursor.execute("""
          DELETE FROM dictionaries WHERE name ILIKE %(dict_name)s
        """, {
          "dict_name": dict_name
        })

        self.pg_connection.commit()

        return 0
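
A hedged example of the full teardown, under the same assumed DictionaryManager wrapper; note that remove() commits the Postgres transaction itself, so no extra commit is needed:

    # Hypothetical driver: delete the dictionary, its terms, its percolators,
    # and its dict_subsets tables in one call. Returns 1 if the name is unknown.
    manager = DictionaryManager()
    if manager.remove("stratigraphic_names") != 0:
        print "No dictionary by that name -- nothing removed"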
Example #3
    def compare_stored_live(self, term, dict_id, field = "contents", fix = False):
        """
        Compare the count of matches in the stored postgres tables
        to the count of matching documents from a live Elasticsearch query.

        If all percolators are working as expected, there should be no difference.

        Args:
            term (string): Term to compare
            dict_id (int): The primary ID of the dictionary to check
            field (string): Elasticsearch field to query (e.g. contents or contents.case_sensitive)
            fix (bool): If true, re-match the term to get stored and live searching synchronized.

        Returns: tuple of (number of stored documents with a match, number of live documents that match)
        """
        n_docs_stored = 0
        n_docs_live = 0
        resp = requests.get("http://deepdivesubmit.chtc.wisc.edu/api/terms", params={"term": term})
        if resp.status_code == 200:
            try:
                if resp.json()["success"]["data"][0] is not None:
                    n_docs_stored = resp.json()["success"]["data"][0]["n_docs"]
            except TypeError:
                print "Error in query!"
                return (0, 0)

        inner = {field : term}

        es_helper = ElasticsearchHelper.ElasticsearchHelper(self.config.get("es_index", self.corpus))

        query = {
            "query": {
                "bool": {
                    "must": {"match_phrase": inner}
                }
            }
        }
        esresp = es_helper.es.search(index=self.config.get("es_index", self.corpus), body=query)
        n_docs_live = esresp["hits"]["total"]
        if fix:
            self.match_term_to_docs(term, field, dict_id)
        del es_helper
        return (n_docs_stored, n_docs_live)
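
A sketch of how the stored/live comparison might be used to audit one term, with fix=True to trigger a re-match when the counts drift (the term and dict_id values here are hypothetical):

    # Hypothetical audit of a single term against dictionary 12
    stored, live = manager.compare_stored_live("basalt", 12, field="contents", fix=True)
    if stored != live:
        print "Drift detected: %s stored vs %s live matches" % (stored, live)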
Example #4
    def match_term_to_docs(self, term, field, dict_id = None, classification=[], from_date=None):
        """
        Use Elasticsearch to match a term to docids and insert into `terms_docs`
        If classification!=[], two searches will be run. The (docid, term) row
        in the `terms_docs` table will indicate whether the additional terms
        were found in the document

        Args:
            term (string): a term from a dictionary
            field (string): The contents field to use (e.g. contents.case_sensitive_word_stemming)
            dict_id (int): The primary ID of the dictionary to match against
            classification (list): list of additional terms that should be searched in the document
        """
        es_helper = ElasticsearchHelper.ElasticsearchHelper(self.config.get("es_index", self.corpus))
        # match twice -- once with hierarchy, once without.
        if from_date is None:
            doc_matches_no_classification = es_helper.search_new_term(term, field, [], size=200)
        else:
            doc_matches_no_classification = es_helper.search_term_from_date(term, from_date, field, [], size=200)
        if classification != []: # if there are addtl terms to consider, run another search
            if from_date is None:
                doc_matches = es_helper.search_new_term(term, field, classification, size=200)
            else:
                doc_matches = es_helper.search_term_from_date(term, from_date, field, classification, size=200)
        # new bulk insert works easiest with a list of dicts
        to_insert = [
            {
                "term" : term,
                "docid" : doc_id,
                "hits" : hits,
                "hierarchy_present" : None if classification == [] else (True if doc_id in doc_matches.keys() else False),
                "dict_id" : dict_id,
                "from_percolation" : False,
                "last_updated" : datetime.now()
            } for doc_id, hits in doc_matches_no_classification.items()]
        self.insert_into_terms_docs_bulk(to_insert)
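
A hedged invocation, assuming the same manager instance; passing a non-empty classification list makes the method run the second, hierarchy-aware search and record hierarchy_present per document:

    # Hypothetical re-match of one term, with two extra hierarchy terms that
    # must co-occur in a document for hierarchy_present to be True.
    manager.match_term_to_docs("basalt", "contents.case_sensitive", dict_id=12,
                               classification=["igneous", "volcanic"])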
Example #5
    def ingest(self, dict_name, update = False):
        """
        Create/update the terms for a given dictionary source.

        Args:
            dict_name (string): the name of a dictionary to update
            update (bool): If True, remove the term and re-add it, to ensure
            matches against the most recent documents.
        Effects:
            Updates `dict_terms`, Elasticsearch percolators, and matches new terms to documents
        """
        es_helper = ElasticsearchHelper.ElasticsearchHelper(self.config.get("es_index", self.corpus))

        # Fetch metadata about this dictionary
        dictionary_meta = self.get_dictionary_metadata(dict_name)

        if dictionary_meta is None:
            print "Dictionary ", dict_name, " not found"
            sys.exit(1)

        # Check if source is remote or local and load terms
        if "http" in dictionary_meta["source"]:
            source = requests.get(dictionary_meta["source"], verify=False)
            try:
                terms = source.json()
            except ValueError: # API is defined remotely, but isn't JSON -- likely a CSV file
                terms = []
                for line in source.text.split("\n"):
                    if line == "": continue
                    line = line.split(",")
                    terms.append([i.strip() for i in line])
                    dictionary_meta["csv"] = True
        else:
            with open(dictionary_meta["source"]) as source:
                if dictionary_meta["key"] is None: # either CSV or a one-term-per-line file
                    terms = []
                    for line in source:
                        if line == "": continue
                        line = line.split(",")
                        terms.append([i.strip() for i in line])
                        dictionary_meta["csv"] = True
                else:
                    terms = json.loads(source.read())

        # Walk down the JSON path to get the list of objects
        if dictionary_meta["json_path"]:
            for key in dictionary_meta["json_path"]:
                terms = terms[key]

        # Get existing terms
        self.pg_cursor.execute("""
          SELECT term
          FROM dict_terms
          WHERE dict_id = %(dict_id)s
        """, {
          "dict_id": dictionary_meta["dict_id"]
        })

        # IAR - 14.Apr.2017 -- make sure the encoding is the same on the term comparisons
        ## we were comparing unicode to ascii encoding and re-adding terms we already have.
        existing = set(unicode(each["term"].strip(), 'utf-8') for each in self.pg_cursor.fetchall())
        if dictionary_meta.get("csv"):
            incoming = set(term[0].strip() for term in terms)
        else:
            incoming = set(unicode(term.strip(), 'utf-8') if dictionary_meta["key"] is None else term[dictionary_meta["key"]] for term in terms)

        for term in list(existing - incoming):
            # Delete from `dict_terms`
            self.delete_term(dictionary_meta["dict_id"], term)
            # Remove percolator
            es_helper.remove_percolator(dictionary_meta["dict_id"], term)

        # Find terms to insert
        to_insert = list(incoming - existing)
        if update:
            to_insert = list(incoming)
        print to_insert

        # Iterate on the list of terms
        for idx, term in enumerate(terms):
            # Store hierarchy information about the term, if applicable
            classification = []

            # If list of terms, just grab the term
            if dictionary_meta["key"] is None: # just a term -- assume no classification/hierarchy
                if dictionary_meta["csv"]:
                    if len(term) > 1:
                        classification = [i for i in term[1:] if i != '']
                    term = term[0]

                if update:
                    print "Updating term %s" % term
                    # Delete from `dict_terms`
                    self.delete_term(dictionary_meta["dict_id"], term.strip())
                    # Remove percolator
                    es_helper.remove_percolator(dictionary_meta["dict_id"], term)

                to_cache = term.strip()
                # Log progress
                sys.stdout.write("Working on " + term + "(term " + str(idx) + " of " + str(len(terms)) + ")\n")
                sys.stdout.flush()

            # Otherwise follow the given path
            else:
                if update:
                    print "Updating term %s" % term
                    # Delete from `dict_terms`
                    self.delete_term(dictionary_meta["dict_id"], term[dictionary_meta["key"]].strip())
                    # Remove percolator
                    es_helper.remove_percolator(dictionary_meta["dict_id"], term[dictionary_meta["key"]])

                print "%s should be gone" % term
                # Log progress
                sys.stdout.write((u"Working on " + term[dictionary_meta["key"]] + u" (term " + str(idx) + u" of " + str(len(terms)) + u")\n").encode("utf-8"))
                sys.stdout.flush()

                try:
                    to_cache = term[dictionary_meta["key"]]

                    if dictionary_meta["classification_path"] is not None:
                        for rank in dictionary_meta["classification_path"]:
                            hierarchy_term = term
                            rank = rank.split('.')
                            for field in rank:
                                if field in hierarchy_term:
                                    hierarchy_term = hierarchy_term[field]
                                else:
                                    hierarchy_term = None
                                    break
                            if hierarchy_term is not None:
                                classification.append(hierarchy_term)

                except ValueError:
                    to_cache = None

            # Make sure a value was returned
            if to_cache is None:
                print "Cannot extract term from list"
                sys.exit(1)


            # If it is a new term, insert into `dict_terms`, create a percolator, and index
            if to_cache in to_insert:
                # check if the term exists as-is in the table, before inserting/matching
                ## note: if a term has multiple classifications, it'll still be inserted multiple times
                self.pg_cursor.execute("""
                  SELECT term
                  FROM dict_terms
                  WHERE dict_id = %(dict_id)s
                    AND term = %(term)s
                    AND classification = %(classification)s;
                """, {
                  "dict_id": dictionary_meta["dict_id"],
                  "term" : to_cache,
                  "classification" : classification
                })
                if self.pg_cursor.fetchone() is not None:
                    print "Term + classification already in table! Skipping!"
                    continue
                else:
                    # Insert into `dict_terms`
                    self.insert_into_dict_terms(dictionary_meta["dict_id"], to_cache, classification)

                field = self.get_contents_field(dict_name)

                # Add percolator
                es_helper.add_percolator(dictionary_meta["dict_id"], to_cache, field, classification)

                # Update `terms_docs` self.match_term_to_docs(to_cache, dictionary_meta["case_sensitive"], classification)
#                self.match_term_to_docs(to_cache, field, dictionary_meta["dict_id"], classification)

        # Update the last updated time for this dictionary
        self.pg_cursor.execute("""
          UPDATE dictionaries SET last_updated = now() WHERE dict_id = %(dict_id)s
        """, {
          "dict_id": dictionary_meta["dict_id"]
        })
        self.pg_connection.commit()

        print "Done ingesting ", dict_name
Example #6
    def update(self, dict_name):
        """
        Update the terms for a given dictionary source. Like ingest, but for updating
        existing dictionaries.
        TODO: decide if this should just become ingest -- it's essentially the same
        logic (noted 31.May.2018).

        Args:
            dict_name (string): the name of a dictionary to update
        Effects:
            Updates `dict_terms`, Elasticsearch percolators, and matches new terms to documents
        """
        es_helper = ElasticsearchHelper.ElasticsearchHelper(self.config.get("es_index", self.corpus))

        # Fetch metadata about this dictionary
        dictionary_meta = self.get_dictionary_metadata(dict_name)

        if dictionary_meta is None:
            print "Dictionary ", dict_name, " not found"
            sys.exit(1)

        # Check if source is remote or local and load terms
        if "http" in dictionary_meta["source"]:
            source = requests.get(dictionary_meta["source"], verify=False)
            try:
                terms = source.json()
            except ValueError: # API is defined remotely, but isn't JSON -- likely a CSV file
                terms = []
                for line in source.text.split("\n"):
                    if line == "": continue
                    line = line.split(",")
                    terms.append([i.strip() for i in line])
                    dictionary_meta["csv"] = True
        else:
            with open(dictionary_meta["source"]) as source:
                if dictionary_meta["key"] is None: # either CSV or a one-term-per-line file
                    terms = []
                    for line in source:
                        if line == "": continue
                        line = line.split(",")
                        terms.append([i.strip() for i in line])
                        dictionary_meta["csv"] = True
                else:
                    terms = json.loads(source.read())

        # Walk down the JSON path to get the list of objects
        if dictionary_meta["json_path"]:
            for key in dictionary_meta["json_path"]:
                terms = terms[key]

        # Get existing terms
        self.pg_cursor.execute("""
          SELECT term, classification
          FROM dict_terms
          WHERE dict_id = %(dict_id)s
        """, {
          "dict_id": dictionary_meta["dict_id"]
        })

        existing = set((unicode(each["term"].strip(), 'utf-8'), tuple(each["classification"])) for each in self.pg_cursor.fetchall())
        incoming = set()

        for idx, term in enumerate(terms): # build the set of (term, classification) tuples
            classification = []

            if dictionary_meta["key"] is None: # just a term -- assume no classification/hierarchy
                if dictionary_meta["csv"]:
                    if len(term) > 1:
                        classification = [i for i in term[1:] if i != '']
                    term = term[0]
                to_cache = term.strip()

            # Otherwise follow the given path
            else:
                if dictionary_meta["classification_path"] is not None:
                    for rank in dictionary_meta["classification_path"]:
                        hierarchy_term = term
                        rank = rank.split('.')
                        for field in rank:
                            if field in hierarchy_term:
                                hierarchy_term = hierarchy_term[field]
                            else:
                                hierarchy_term = None
                                break
                        if hierarchy_term is not None:
                            classification.append(hierarchy_term)
                term = term[dictionary_meta["key"]]

            incoming.add((term, tuple(classification)))

        # remove terms that are no longer around
        for term, classification in list(existing - incoming):
            self.delete_term(dictionary_meta["dict_id"], term, list(classification))
            es_helper.remove_percolator(dictionary_meta["dict_id"], term)

        for term, classification in incoming:
	    print "Working on %s" % term
            classification = list(classification)

            field = self.get_contents_field(dict_name)

            # check if the term exists as-is in the table, before inserting/matching
            ## note: if a term has multiple classifications, it'll still be inserted multiple times
            self.pg_cursor.execute("""
              SELECT last_updated
              FROM dict_terms
              WHERE dict_id = %(dict_id)s
                AND term = %(term)s
                AND classification = %(classification)s;
            """, {
              "dict_id": dictionary_meta["dict_id"],
              "term" : term,
              "classification" : classification
            })
            date_from = self.pg_cursor.fetchone()
            if date_from is None:
                # Insert into `dict_terms`
                self.insert_into_dict_terms(dictionary_meta["dict_id"], term, classification)
                # Add percolator
                es_helper.add_percolator(dictionary_meta["dict_id"], term, field, classification)
            else:
                date_from = date_from["last_updated"]


            # Update `terms_docs` self.match_term_to_docs(term, dictionary_meta["case_sensitive"], classification)
#            self.match_term_to_docs(term, field, dictionary_meta["dict_id"], classification, date_from)

            # Update the last updated time in dict_terms
#            self.pg_cursor.execute("""
#              UPDATE dict_terms SET last_updated = now() WHERE dict_id = %(dict_id)s AND term = %(term)s
#            """, {
#              "dict_id": dictionary_meta["dict_id"],
#              "term" : term
#            })
#            self.pg_connection.commit()

        # Update the last updated time for this dictionary
        self.pg_cursor.execute("""
          UPDATE dictionaries SET last_updated = now() WHERE dict_id = %(dict_id)s
        """, {
          "dict_id": dictionary_meta["dict_id"]
        })
        self.pg_connection.commit()

        print "Done updating ", dict_name
Example #7
    def health_check(self, dict_name=None, fix=False):
        """
        Do a health check against the passed dictionary. If no dictionary is
        specified, run a health check against all of them.  The health check
        includes:
                * Ensure percolators exist + work
                * Ensure all terms with matches have proper entries in terms_docs
                * Check latest match/subset date (and encourage rerunning if the data looks stale)

        Args:
            dict_name (string): Name of dictionary to check. If None, check all dictionaries.
            fix (bool): If True, it will attempt to fix the errors.

        Returns: True if it looks healthy, otherwise False

        """
        if dict_name is not None:
            dicts = [dict_name]
        else:
            self.pg_cursor.execute("""SELECT name FROM dictionaries;""")
            dicts = [i["name"] for i in self.pg_cursor.fetchall()]

        es_helper = ElasticsearchHelper.ElasticsearchHelper(self.config.get("es_index", self.corpus))

        healthy = True
        for dictionary in dicts:
            dictionary_meta = self.get_dictionary_metadata(dictionary)

            if dictionary_meta is None:
                print "Dictionary ", dict_name, " not found"
                sys.exit(1)

            print "--- %s ---" % dictionary
            self.pg_cursor.execute("""SELECT term
                     FROM dict_terms
                       WHERE term NOT IN (
                                   SELECT distinct term from terms_docs
                                            )
                          AND dict_id = (SELECT dict_id FROM dictionaries WHERE name ilike %(name)s)
                              ORDER BY term;""", {"name" : dict_name})
            # get terms
            terms = [i["term"] for i in self.pg_cursor.fetchall()]

            field = self.get_contents_field(dictionary)
            print "Checking percolators..."
            for term in terms:
                # check for percolator existence
                check = es_helper.does_percolator_exist(dictionary_meta["dict_id"], term)
                if not check:
                    healthy = False
                    print "Missing percolator! Term: %s" % term
                    if fix:
                        es_helper.add_percolator(dictionary_meta["dict_id"], term, field)
                # check if percolator works?

            print "Checking match counts for a random subsample..."
            # check random sample of terms
            self.pg_cursor.execute("""SELECT term
                     FROM dict_terms
                          WHERE dict_id = (SELECT dict_id FROM dictionaries WHERE name ilike %(name)s)
                          AND random() < 0.1 LIMIT 1000;""", {"name" : dict_name})
            terms_sample = [i["term"] for i in self.pg_cursor.fetchall()]
            mismatched_terms = 0
            for term in terms_sample:
                self.pg_cursor.execute("""SELECT DISTINCT(docid) FROM terms_docs WHERE term = %(term)s;""", {"term": term})
                stored_docs = [i['docid'] for i in self.pg_cursor.fetchall()]
                # do 'live' check
                current_matches = es_helper.search_new_term(term, field)
                if len(current_matches) != len(stored_docs):
                    mismatched_terms += 1
                    healthy = False
                    print "Stored vs live matches disagree for term %s! (stored: %s, live: %s)" % (term, len(stored_docs), len(current_matches))
                if fix:
                    self.match_term_to_docs(term, field)
            print "%s of %s checked terms had disagreement in stored vs current matches." % (mismatched_terms, len(terms_sample))

        return healthy
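
A sketch of a periodic health sweep, assuming the same manager wrapper; passing no dict_name checks every dictionary, and fix=True re-adds missing percolators and re-matches drifted terms as it goes:

    # Hypothetical nightly check across all dictionaries
    if not manager.health_check(fix=True):
        print "Health check found (and attempted to fix) problems"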
Example #8
#!/usr/bin/env python
# encoding: utf-8

import GddConfig
import ElasticsearchHelper
import elasticsearch
import time
import glob
import os

esh = ElasticsearchHelper.ElasticsearchHelper(es_index="temp_articles",
                                              es_type="article")


def match_term_to_docs(term,
                       field,
                       dict_id=None,
                       classification=[],
                       from_date=None):
    """
    Use Elasticsearch to match a term to docids and insert into `terms_docs`
    If classification!=[], two searches will be run. The (docid, term) row
    in the `terms_docs` table will indicate whether the additional terms
    were found in the document

    Args:
        term (string): a term from a dictionary
        field (string): The contents field to use (e.g. contents.case_sensitive_word_stemming)
        dict_id (int): The primary ID of the dictionary to match against
        classification (list): list of additional terms that should be searched in the document
    """