Example #1
 def initialize(self, alias_data):
     from detect.data.response import Response
     self.data_response = Response()
     self.data_response.open_connection()
     self.alias_data = alias_data
     self.param_extractor = ParamExtractor(self)
     self.path_extractor = PathExtractor(self)
     self.entity_factory = EntityFactory(self.alias_data)
     self.brute_detector = Detector(self.alias_data)
Example #2
 def __init__(self, alias_data):
     self.POS_to_use = ["J", "N", "V", "R", "I"]
     self.extra_stop_words = ["show"]
     self.stopwords = stopwords.words('english')
     self.stopwords.extend(self.extra_stop_words)
     self.skip_words = [
         "all", "any", "anything", "display", "every", "everything", "find",
         "go", "like", "looking", "nice", "pair", "show", "some",
         "something", "want"
     ]
     self.stemmer = PorterStemmer()
     self.tokenizer = nltk.WordPunctTokenizer()
     self.alias_data = alias_data
     self.entity_factory = EntityFactory(self.alias_data)
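
Note that this constructor depends on NLTK data packages that are downloaded separately from the library itself. A minimal setup sketch; the resource names below are the standard NLTK identifiers for the stopword corpus and the default POS tagger (they can vary slightly between NLTK versions) and are not taken from this project:

    import nltk

    # One-time downloads; without them, stopwords.words('english') and
    # nltk.pos_tag(...) raise LookupError the first time Detector runs.
    nltk.download("stopwords")
    nltk.download("averaged_perceptron_tagger")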
Example #3
File: detector.py Project: rdefeo/detect
 def __init__(self, alias_data):
     self.POS_to_use = ["J", "N", "V", "R", "I"]
     self.extra_stop_words = [
         "show"
     ]
     self.stopwords = stopwords.words('english')
     self.stopwords.extend(self.extra_stop_words)
     self.skip_words = [
         "all",
         "any",
         "anything",
         "display",
         "every",
         "everything",
         "find",
         "go",
         "like",
         "looking",
         "nice",
         "pair",
         "show",
         "some",
         "something",
         "want"
     ]
     self.stemmer = PorterStemmer()
     self.tokenizer = nltk.WordPunctTokenizer()
     self.alias_data = alias_data
     self.entity_factory = EntityFactory(self.alias_data)
Example #4
File: detect.py Project: rdefeo/detect
    def initialize(self, alias_data):
        from detect.data.response import Response

        self.data_response = Response()
        self.data_response.open_connection()
        self.alias_data = alias_data
        self.param_extractor = ParamExtractor(self)
        self.path_extractor = PathExtractor(self)
        self.entity_factory = EntityFactory(self.alias_data)
        self.brute_detector = Detector(self.alias_data)
Example #5
File: detector.py Project: rdefeo/detect
class Detector:
    def __init__(self, alias_data):
        self.POS_to_use = ["J", "N", "V", "R", "I"]
        self.extra_stop_words = [
            "show"
        ]
        self.stopwords = stopwords.words('english')
        self.stopwords.extend(self.extra_stop_words)
        self.skip_words = [
            "all",
            "any",
            "anything",
            "display",
            "every",
            "everything",
            "find",
            "go",
            "like",
            "looking",
            "nice",
            "pair",
            "show",
            "some",
            "something",
            "want"
        ]
        self.stemmer = PorterStemmer()
        self.tokenizer = nltk.WordPunctTokenizer()
        self.alias_data = alias_data
        self.entity_factory = EntityFactory(self.alias_data)

    def detect_intent(self, preperation_result):
        return {
            "confidence": 80.0,
            "intent": "include"
        }

    def detect(self, q):
        preperation_result = self.preparation(q)
        outcome = self.detect_intent(preperation_result)
        outcome["entities"] = []
        entities = self.detect_entities(self.alias_data, preperation_result)
        for x in entities["detections"]:
            outcome["entities"].append(self.entity_factory.create(x["type"], x["key"], source=x["source"]))

        for x in entities["non_detections"]:
            outcome["entities"].append(self.entity_factory.create("unknown", x, confidence=10.0))

        return [outcome]

    def preparation(self, q):
        # used_query = q.lower().strip()
        used_query = q
        result = {
            "used_query": used_query,
            "original_query": q
        }
        raw_tokens = self.tokenizer.tokenize(used_query)
        token_spans = list(self.tokenizer.span_tokenize(used_query))
        tagged_words = nltk.pos_tag(raw_tokens)

        result["tokens"] = []
        for index, value in enumerate(raw_tokens):
            lower_value = value.lower()
            stop_word = lower_value in self.stopwords
            skip_word = lower_value in self.skip_words
            result["tokens"].append({
                "value": lower_value,
                "start": token_spans[index][0],
                "end": token_spans[index][1],
                "pos": tagged_words[index][1],
                "use": tagged_words[index][1][0] in self.POS_to_use and not stop_word and not skip_word,
                "stem": self.stemmer.stem(lower_value),
                "stop_word": stop_word,
                "skip_word": skip_word
            })

        return result

    def create_found_doc(self, term, tokens, found_item, start, end):
        return {
            "term": term,
            "tokens": tokens,
            "found_item": found_item,
            "start": start,
            "end": end,
            "position": "%s_%s" % (start, end)
        }

    def find_matches(self, ngram_size: int, tokens: list, vocab, can_not_match=None):
        if can_not_match is None:
            can_not_match = []
        res = {
            "found": [],
            "can_not_match": can_not_match
        }
        tokens_to_use = [x for x in tokens if x["use"]]
        n = min(len(tokens_to_use), ngram_size)
        for ngram in ngrams(tokens_to_use, n):
            ngram_term = " ".join(
                x["value"] for x in ngram
            )
            start = ngram[0]["start"]
            end = ngram[-1:][0]["end"]

            if ngram_term in vocab["en"]:
                # must be copied in this way
                key = "%s_%s" % (start, end)
                new_doc = self.create_found_doc(
                    ngram_term,
                    [x["value"] for x in ngram],
                    vocab["en"][ngram_term],
                    start,
                    end
                )
                if not any(y for y in res["found"] if new_doc["position"] == y["position"]):
                    res["found"].append(new_doc)
            elif n > 0:
                new_found_items = self.find_matches(
                    n - 1,
                    tokens,
                    vocab,
                    can_not_match=res["can_not_match"]
                )["found"]
                res["found"].extend(
                    [x for x in new_found_items if not any(y for y in res["found"] if x["position"] == y["position"])]
                )
            elif (n == 0 and ngram[0]["use"] and ngram[0]["pos"][0] in self.POS_to_use
                  and ngram[0] not in res["can_not_match"]):
                res["can_not_match"].append(ngram[0])

        flattened_tokens = [x for entity in res["found"] for x in entity["tokens"]]
        res["can_not_match"] = [x for x in res["can_not_match"] if x["value"] not in flattened_tokens]
        return res

    def autocorrect_query(self, used_query, found_entities):
        corrected_query = used_query
        corrected = False
        # need to work from end of string backwards otherwise it gets messed up with adding/removing chars
        for entity in sorted(found_entities, key=itemgetter("start"), reverse=True):
            for x in [x for x in entity["found_item"] if x["match_type"] == "spelling"]:
                corrected = True
                corrected_query = (corrected_query[0:entity["start"]]
                                   + x["display_name"]
                                   + corrected_query[entity["end"]:])

        if corrected:
            return corrected_query
        else:
            return None

    def key_matches(self, found_entities):
        return [x["found_item"][0] for x in found_entities]

    def unique_matches(self, found_entities):
        # flattened = [{
        #                  "type": x["type"],
        #                  "key": x["key"],
        #                  "source": x["source"]
        #              } for entity in found_entities for x in entity["found_item"]]
        flattened = [{
                         "type": x["type"],
                         "key": x["key"],
                         "source": x["source"]
                     } for entity in found_entities for x in entity["found_item"]]
        return list({v['type'] + v['key']: v for v in flattened}.values())

    def unique_non_detections(self, can_not_match):
        return list(set(x["value"] for x in can_not_match))

    def format_found_entities(self, all_found):
        unique_items = []
        for current_item in all_found:
            if not any([x for x in all_found
                        if current_item["position"] != x["position"]
                        and current_item["start"] >= x["start"]
                        and current_item["end"] <= x["end"]]):
                unique_items.append(current_item)
        return unique_items

    def detect_entities(self, vocab, preparation_result):
        found_entities = self.find_matches(3, preparation_result["tokens"], vocab)
        found = self.format_found_entities(found_entities["found"])

        autocorrected_query = self.autocorrect_query(
            preparation_result["used_query"],
            found
        )
        key_matches = self.key_matches(found)
        # unique_entities = self.unique_matches(found)
        res = {
            "detections": key_matches,
            "non_detections": self.unique_non_detections(found_entities["can_not_match"])
        }
        if autocorrected_query is not None:
            res["autocorrected_query"] = autocorrected_query

        return res
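
Taken together, the lookups above imply the shape alias_data must have: a vocabulary under "en" mapping lower-cased terms to a list of match dicts carrying type, key, source, match_type and display_name. A hypothetical minimal value and call, assuming EntityFactory (defined elsewhere in the project) accepts the same dict; the "color" entity and every field value here are illustrative, not taken from the repository:

    # Hypothetical alias_data with the shape detect_entities/autocorrect_query expect.
    alias_data = {
        "en": {
            "red": [
                {
                    "type": "color",        # read by detect() via entity_factory.create(...)
                    "key": "red",
                    "source": "alias",
                    "match_type": "exact",  # only "spelling" matches trigger autocorrection
                    "display_name": "red",
                }
            ]
        }
    }

    detector = Detector(alias_data)
    outcomes = detector.detect("show me something red")
    # -> [{"confidence": 80.0, "intent": "include", "entities": [...]}]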
Example #6
class Detect(RequestHandler):
    brute_detector = None
    alias_data = None
    data_response = None
    param_extractor = None
    path_extractor = None
    entity_factory = None

    def data_received(self, chunk):
        pass

    def initialize(self, alias_data):
        from detect.data.response import Response
        self.data_response = Response()
        self.data_response.open_connection()
        self.alias_data = alias_data
        self.param_extractor = ParamExtractor(self)
        self.path_extractor = PathExtractor(self)
        self.entity_factory = EntityFactory(self.alias_data)
        self.brute_detector = Detector(self.alias_data)

    def on_finish(self):
        pass

    @asynchronous
    def post(self, *args, **kwargs):
        self.set_header('Content-Type', 'application/json')

        detection_id = ObjectId()

        app_log.info(
            "app=detection,function=detect,detection_id=%s,application_id=%s,session_id=%s,q=%s",
            detection_id, self.param_extractor.application_id(),
            self.param_extractor.session_id(), self.param_extractor.query())

        if False:
            url = "%smessage?v=%s&q=%s&msg_id=%s" % (
                WIT_URL, WIT_URL_VERSION,
                url_escape(self.param_extractor.query()), str(detection_id))
            r = HTTPRequest(url,
                            headers={"Authorization": "Bearer %s" % WIT_TOKEN})
            client = AsyncHTTPClient()
            client.fetch(r, callback=self.wit_call_back)
        else:
            date = datetime.now()
            outcomes = self.brute_detector.detect(self.param_extractor.query())
            self.data_response.insert(self.param_extractor.user_id(),
                                      self.param_extractor.application_id(),
                                      self.param_extractor.session_id(),
                                      detection_id,
                                      "brute",
                                      date,
                                      self.param_extractor.query(),
                                      outcomes=outcomes)

            self.set_status(202)
            self.set_header("Location", "/%s" % str(detection_id))
            self.set_header("_id", str(detection_id))
            self.finish()

            Worker(self.param_extractor.user_id(),
                   self.param_extractor.application_id(),
                   self.param_extractor.session_id(),
                   detection_id,
                   date,
                   self.param_extractor.query(),
                   self.param_extractor.skip_slack_log(),
                   detection_type="wit",
                   outcomes=outcomes).start()

    @asynchronous
    def get(self, detection_id, *args, **kwargs):
        data = self.data_response.get(
            self.path_extractor.detection_id(detection_id))
        if data is not None:
            self.set_header('Content-Type', 'application/json')
            self.set_status(200)
            self.finish(
                dumps({
                    "type": data["type"],
                    "q": data["q"],
                    "outcomes": data["outcomes"],
                    "_id": data["_id"],
                    "version": data["version"],
                    "timestamp": data["timestamp"]
                }))
        else:
            self.set_status(404)
            self.finish()

    def wit_call_back(self, response):
        data = json_decode(response.body)
        outcomes = []
        date = datetime.now()
        for outcome in data["outcomes"]:
            entities = []
            for _type in outcome["entities"].keys():
                if _type not in ["polite"]:
                    for value in outcome["entities"][_type]:
                        suggested = value[
                            "suggested"] if "suggested" in value else False
                        key = value["value"]["value"] if type(
                            value["value"]) is dict else value["value"]
                        entity = self.entity_factory.create(
                            _type, key, suggested)

                        # TODO this needs to be moved somewhere else, preferably a separate service call
                        entities.append(entity)

            outcomes.append({
                "confidence": outcome["confidence"] * 100,
                "intent": outcome["intent"],
                "entities": entities
            })

        self.data_response.insert(self.param_extractor.user_id(),
                                  self.param_extractor.application_id(),
                                  self.param_extractor.session_id(),
                                  ObjectId(data["msg_id"]),
                                  "wit",
                                  date,
                                  self.param_extractor.query(),
                                  outcomes=outcomes)

        self.set_status(202)
        self.set_header("Location", "/%s" % data["msg_id"])
        self.set_header("_id", data["msg_id"])
        self.finish()

        Worker(self.param_extractor.user_id(),
               self.param_extractor.application_id(),
               self.param_extractor.session_id(),
               ObjectId(data["msg_id"]),
               date,
               self.param_extractor.query(),
               self.param_extractor.skip_slack_log(),
               detection_type="wit",
               outcomes=outcomes).start()
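
For context, Tornado passes the dict in each handler spec to initialize() as keyword arguments, which is how alias_data reaches the handler above. A minimal wiring sketch, assuming the handler and its dependencies are importable; the routes and port are assumptions inferred from the "/%s" Location header, not taken from the project:

    from tornado.ioloop import IOLoop
    from tornado.web import Application

    def make_app(alias_data):
        # The third element of each handler spec becomes the kwargs of Detect.initialize.
        return Application([
            (r"/", Detect, dict(alias_data=alias_data)),
            (r"/([0-9a-fA-F]{24})", Detect, dict(alias_data=alias_data)),
        ])

    if __name__ == "__main__":
        make_app(alias_data={}).listen(8888)  # real alias data would be loaded at startup
        IOLoop.current().start()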
Example #7
class Detector:
    def __init__(self, alias_data):
        self.POS_to_use = ["J", "N", "V", "R", "I"]
        self.extra_stop_words = ["show"]
        self.stopwords = stopwords.words('english')
        self.stopwords.extend(self.extra_stop_words)
        self.skip_words = [
            "all", "any", "anything", "display", "every", "everything", "find",
            "go", "like", "looking", "nice", "pair", "show", "some",
            "something", "want"
        ]
        self.stemmer = PorterStemmer()
        self.tokenizer = nltk.WordPunctTokenizer()
        self.alias_data = alias_data
        self.entity_factory = EntityFactory(self.alias_data)

    def detect_intent(self, preperation_result):
        return {"confidence": 80.0, "intent": "include"}

    def detect(self, q):
        preperation_result = self.preparation(q)
        outcome = self.detect_intent(preperation_result)
        outcome["entities"] = []
        entities = self.detect_entities(self.alias_data, preperation_result)
        for x in entities["detections"]:
            outcome["entities"].append(
                self.entity_factory.create(x["type"],
                                           x["key"],
                                           source=x["source"]))

        for x in entities["non_detections"]:
            outcome["entities"].append(
                self.entity_factory.create("unknown", x, confidence=10.0))

        return [outcome]

    def preparation(self, q):
        # used_query = q.lower().strip()
        used_query = q
        result = {"used_query": used_query, "original_query": q}
        raw_tokens = self.tokenizer.tokenize(used_query)
        token_spans = list(self.tokenizer.span_tokenize(used_query))
        tagged_words = nltk.pos_tag(raw_tokens)

        result["tokens"] = []
        for index, value in enumerate(raw_tokens):
            lower_value = value.lower()
            stop_word = lower_value in self.stopwords
            skip_word = lower_value in self.skip_words
            result["tokens"].append({
                "value":
                lower_value,
                "start":
                token_spans[index][0],
                "end":
                token_spans[index][1],
                "pos":
                tagged_words[index][1],
                "use":
                tagged_words[index][1][0] in self.POS_to_use and not stop_word
                and not skip_word,
                "stem":
                self.stemmer.stem(lower_value),
                "stop_word":
                stop_word,
                "skip_word":
                skip_word
            })

        return result

    def create_found_doc(self, term, tokens, found_item, start, end):
        return {
            "term": term,
            "tokens": tokens,
            "found_item": found_item,
            "start": start,
            "end": end,
            "position": "%s_%s" % (start, end)
        }

    def find_matches(self,
                     ngram_size: int,
                     tokens: list,
                     vocab,
                     can_not_match=None):
        if can_not_match is None:
            can_not_match = []
        res = {"found": [], "can_not_match": can_not_match}
        tokens_to_use = [x for x in tokens if x["use"]]
        n = min(len(tokens_to_use), ngram_size)
        for ngram in ngrams(tokens_to_use, n):
            ngram_term = " ".join(x["value"] for x in ngram)
            start = ngram[0]["start"]
            end = ngram[-1:][0]["end"]

            if ngram_term in vocab["en"]:
                # must be copied in this way
                key = "%s_%s" % (start, end)
                new_doc = self.create_found_doc(ngram_term,
                                                [x["value"] for x in ngram],
                                                vocab["en"][ngram_term], start,
                                                end)
                if not any(y for y in res["found"]
                           if new_doc["position"] == y["position"]):
                    res["found"].append(new_doc)
            elif n > 0:
                new_found_items = self.find_matches(
                    n - 1, tokens, vocab,
                    can_not_match=res["can_not_match"])["found"]
                res["found"].extend([
                    x for x in new_found_items
                    if not any(y for y in res["found"]
                               if x["position"] == y["position"])
                ])
            elif (n == 0 and ngram[0]["use"]
                  and ngram[0]["pos"][0] in self.POS_to_use
                  and ngram[0] not in res["can_not_match"]):
                res["can_not_match"].append(ngram[0])

        flattened_tokens = [
            x for entity in res["found"] for x in entity["tokens"]
        ]
        res["can_not_match"] = [
            x for x in res["can_not_match"]
            if x["value"] not in flattened_tokens
        ]
        return res

    def autocorrect_query(self, used_query, found_entities):
        corrected_query = used_query
        corrected = False
        # need to work from end of string backwards otherwise it gets messed up with adding/removing chars
        for entity in sorted(found_entities,
                             key=itemgetter("start"),
                             reverse=True):
            for x in [
                    x for x in entity["found_item"]
                    if x["match_type"] == "spelling"
            ]:
                corrected = True
                corrected_query = (corrected_query[0:entity["start"]]
                                   + x["display_name"]
                                   + corrected_query[entity["end"]:])

        if corrected:
            return corrected_query
        else:
            return None

    def key_matches(self, found_entities):
        return [x["found_item"][0] for x in found_entities]

    def unique_matches(self, found_entities):
        # flattened = [{
        #                  "type": x["type"],
        #                  "key": x["key"],
        #                  "source": x["source"]
        #              } for entity in found_entities for x in entity["found_item"]]
        flattened = [{
            "type": x["type"],
            "key": x["key"],
            "source": x["source"]
        } for entity in found_entities for x in entity["found_item"]]
        return list({v['type'] + v['key']: v for v in flattened}.values())

    def unique_non_detections(self, can_not_match):
        return list(set(x["value"] for x in can_not_match))

    def format_found_entities(self, all_found):
        unique_items = []
        for current_item in all_found:
            if not any([
                    x for x in all_found
                    if current_item["position"] != x["position"]
                    and current_item["start"] >= x["start"]
                    and current_item["end"] <= x["end"]
            ]):
                unique_items.append(current_item)
        return unique_items

    def detect_entities(self, vocab, preparation_result):
        found_entities = self.find_matches(3, preparation_result["tokens"],
                                           vocab)
        found = self.format_found_entities(found_entities["found"])

        autocorrected_query = self.autocorrect_query(
            preparation_result["used_query"], found)
        key_matches = self.key_matches(found)
        # unique_entities = self.unique_matches(found)
        res = {
            "detections": key_matches,
            "non_detections": self.unique_non_detections(found_entities["can_not_match"])
        }
        if autocorrected_query is not None:
            res["autocorrected_query"] = autocorrected_query

        return res
Example #8
File: detect.py Project: rdefeo/detect
class Detect(RequestHandler):
    brute_detector = None
    alias_data = None
    data_response = None
    param_extractor = None
    path_extractor = None
    entity_factory = None

    def data_received(self, chunk):
        pass

    def initialize(self, alias_data):
        from detect.data.response import Response

        self.data_response = Response()
        self.data_response.open_connection()
        self.alias_data = alias_data
        self.param_extractor = ParamExtractor(self)
        self.path_extractor = PathExtractor(self)
        self.entity_factory = EntityFactory(self.alias_data)
        self.brute_detector = Detector(self.alias_data)

    def on_finish(self):
        pass

    @asynchronous
    def post(self, *args, **kwargs):
        self.set_header("Content-Type", "application/json")

        detection_id = ObjectId()

        app_log.info(
            "app=detection,function=detect,detection_id=%s,application_id=%s,session_id=%s,q=%s",
            detection_id,
            self.param_extractor.application_id(),
            self.param_extractor.session_id(),
            self.param_extractor.query(),
        )

        if False:
            url = "%smessage?v=%s&q=%s&msg_id=%s" % (
                WIT_URL,
                WIT_URL_VERSION,
                url_escape(self.param_extractor.query()),
                str(detection_id),
            )
            r = HTTPRequest(url, headers={"Authorization": "Bearer %s" % WIT_TOKEN})
            client = AsyncHTTPClient()
            client.fetch(r, callback=self.wit_call_back)
        else:
            date = datetime.now()
            outcomes = self.brute_detector.detect(self.param_extractor.query())
            self.data_response.insert(
                self.param_extractor.user_id(),
                self.param_extractor.application_id(),
                self.param_extractor.session_id(),
                detection_id,
                "brute",
                date,
                self.param_extractor.query(),
                outcomes=outcomes,
            )

            self.set_status(202)
            self.set_header("Location", "/%s" % str(detection_id))
            self.set_header("_id", str(detection_id))
            self.finish()

            Worker(
                self.param_extractor.user_id(),
                self.param_extractor.application_id(),
                self.param_extractor.session_id(),
                detection_id,
                date,
                self.param_extractor.query(),
                self.param_extractor.skip_slack_log(),
                detection_type="wit",
                outcomes=outcomes,
            ).start()

    @asynchronous
    def get(self, detection_id, *args, **kwargs):
        data = self.data_response.get(self.path_extractor.detection_id(detection_id))
        if data is not None:
            self.set_header("Content-Type", "application/json")
            self.set_status(200)
            self.finish(
                dumps(
                    {
                        "type": data["type"],
                        "q": data["q"],
                        "outcomes": data["outcomes"],
                        "_id": data["_id"],
                        "version": data["version"],
                        "timestamp": data["timestamp"],
                    }
                )
            )
        else:
            self.set_status(404)
            self.finish()

    def wit_call_back(self, response):
        data = json_decode(response.body)
        outcomes = []
        date = datetime.now()
        for outcome in data["outcomes"]:
            entities = []
            for _type in outcome["entities"].keys():
                if _type not in ["polite"]:
                    for value in outcome["entities"][_type]:
                        suggested = value["suggested"] if "suggested" in value else False
                        key = value["value"]["value"] if type(value["value"]) is dict else value["value"]
                        entity = self.entity_factory.create(_type, key, suggested)

                        # TODO this needs to be moved somewhere else, preferably a separate service call
                        entities.append(entity)

            outcomes.append(
                {"confidence": outcome["confidence"] * 100, "intent": outcome["intent"], "entities": entities}
            )

        self.data_response.insert(
            self.param_extractor.user_id(),
            self.param_extractor.application_id(),
            self.param_extractor.session_id(),
            ObjectId(data["msg_id"]),
            "wit",
            date,
            self.param_extractor.query(),
            outcomes=outcomes,
        )

        self.set_status(202)
        self.set_header("Location", "/%s" % data["msg_id"])
        self.set_header("_id", data["msg_id"])
        self.finish()

        Worker(
            self.param_extractor.user_id(),
            self.param_extractor.application_id(),
            self.param_extractor.session_id(),
            ObjectId(data["msg_id"]),
            date,
            self.param_extractor.query(),
            self.param_extractor.skip_slack_log(),
            detection_type="wit",
            outcomes=outcomes,
        ).start()
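
Finally, wit_call_back assumes the legacy Wit.ai /message response format. Inferred purely from the field accesses in the handler (not from Wit documentation), a body it can process looks roughly like the following; the "color" entity type and all values are illustrative:

    # Hypothetical payload shape accepted by wit_call_back.
    example_wit_response = {
        "msg_id": "54c1e2dfa0e5c71e0cbabc9f",  # must be a 24-hex-char ObjectId string
        "outcomes": [
            {
                "confidence": 0.87,             # multiplied by 100 before storage
                "intent": "include",
                "entities": {
                    "polite": [],               # this type is ignored by the handler
                    "color": [
                        {"value": "red", "suggested": True},
                        {"value": {"value": "blue"}},  # nested form is also unwrapped
                    ],
                },
            }
        ],
    }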