Example #1
import numpy as np
import pandas as pd
from src.utilities.objects import VadChunk
from src.utilities import sken_singleton, sken_logger, constants
from src.services import question_detection

logger = sken_logger.get_logger("snippet_service")

# def agent_customer_sequence(input_excel_file):
#     df = pd.read_excel(input_excel_file)
#     df['text'] = df['text'].astype(str)  # attribute assignment (df.text_ = ...) would not create a column
#     df['a_bin'] = 0
#     df['b_bin'] = 0
#     df.a_bin = df.speaker.apply(lambda x: 0 if x == 'Agent' else 1)
#     df.b_bin = df.speaker.apply(lambda x: 0 if x == 'Customer' else 1)
#     df['a_bin_cumsum'] = df.a_bin.cumsum()
#     df['b_bin_cumsum'] = df.b_bin.cumsum()
#     df = df.drop(['a_bin', 'b_bin'], axis=1)
#     df['a_bin'] = df.speaker.apply(lambda x: 1 if x == 'Agent' else 0)
#     df['b_bin'] = df.speaker.apply(lambda x: 1 if x == 'Customer' else 0)
#     df['a_con'] = df.a_bin_cumsum * df.a_bin
#     df['b_con'] = df.b_bin_cumsum * df.b_bin
#     df.drop(['a_bin_cumsum', 'b_bin_cumsum', 'a_bin', 'b_bin'], axis=1, inplace=True)
#     df['identifier'] = df.a_con + df.b_con
#     df['name_idnet'] = df.speaker + "_" + df.identifier.astype(str)
#     df.drop(['a_con', 'b_con'], axis=1, inplace=True)
#     df['text_'] = df['text'] + ". "
#     df1 = df[['name_idnet', 'text_']].groupby(['name_idnet'], as_index=False).sum()
#     df2 = df.drop_duplicates("name_idnet")[['speaker', 'name_idnet']]
#     df2 = df2.merge(df1, on='name_idnet')
#     df2 = df2.drop(["name_idnet"], axis=1)
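The commented-out helper above merges consecutive rows from the same speaker by building two cumulative-sum columns. A shorter sketch of the same consecutive-grouping idea (a hypothetical standalone example, not part of the original module) uses the shift/cumsum idiom:

import pandas as pd

df = pd.DataFrame({
    "speaker": ["Agent", "Agent", "Customer", "Agent"],
    "text": ["Hi", "How can I help?", "My order is late", "Let me check"],
})
# a new group starts whenever the speaker changes from the previous row
turn = (df["speaker"] != df["speaker"].shift()).cumsum().rename("turn")
merged = (df.groupby([turn, "speaker"], sort=False)["text"]
            .apply(". ".join)
            .reset_index())
print(merged[["speaker", "text"]])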
Example #2
import os

import jsonpickle
from flask import Flask, request, Response, render_template, flash, redirect
from werkzeug.utils import secure_filename

from src.utilities import sken_logger, db, sken_singleton, constants
from src.services import dimension_engine
from src.services import facet_service

logger = sken_logger.get_logger("main")

sken_singleton.Singletons.get_instance()
db.DBUtils.get_instance()
tmp_pro_id = None  # tracks the cached product id so the cache is reset when a new product request arrives
request_count = 0

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route("/upload_file", methods=["POST", "GET"])
def upload_csv():
    global tmp_pro_id, request_count

    if request.method == "POST":
        if 'file' not in request.files:
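            # The original listing is truncated here; what follows is a hedged
            # completion of the standard Flask upload pattern (the flash
            # messages and the "uploads" folder are assumptions, and flash()
            # requires app.secret_key to be set somewhere not shown).
            flash("No file part in the request")
            return redirect(request.url)
        file = request.files['file']
        if file.filename == '':
            flash("No file selected")
            return redirect(request.url)
        filename = secure_filename(file.filename)
        file.save(os.path.join("uploads", filename))
        return Response(jsonpickle.encode({"uploaded": filename}),
                        mimetype="application/json")
    return render_template('index.html')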
Example #3
import time

from google.cloud import translate
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
from src.utilities import constants, sken_logger, db
import spacy
import textacy

logger = sken_logger.get_logger("sentence_services")

nlp = spacy.load("en_core_web_sm")

client = translate.TranslationServiceClient()
parent = client.location_path(constants.fetch_constant("translate_project_id"),
                              "global")
target_languages = [
    item.language_code for item in client.get_supported_languages(
        parent).languages[:int(constants.fetch_constant("translation_depth"))]
]


def paraphrase_sentence(text):
    global parent, target_languages

    def get_the_other(language):
        response = client.translate_text(
            parent=parent,
            contents=[text],
            mime_type='text/plain',  # mime types: text/plain, text/html
            source_language_code='en-IN',
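            # The original listing is truncated here; the rest of this helper
            # is a hedged completion of the back-translation (round-trip)
            # paraphrase pattern the imports suggest, not the original code.
            target_language_code=language)
        back = client.translate_text(
            parent=parent,
            contents=[response.translations[0].translated_text],
            mime_type='text/plain',
            source_language_code=language,
            target_language_code='en-IN')
        return back.translations[0].translated_text

    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as exe:
        paraphrases = list(exe.map(get_the_other, target_languages))
    return list(set(paraphrases))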
Example #4
import os

import pandas as pd
from google.cloud import translate
from src.utilities import db, constants, sken_logger

client = translate.TranslationServiceClient()
logger = sken_logger.get_logger("facet_signal")


def make_facet_signal_entries(file_path, org_id, prod_id):
    df = pd.read_excel(file_path)
    for facet_signal in range(len(df)):
        sql = """insert into facet_signal (value, facet_id, org_id, product_id) values(%s, (select 
        id from facet where name_ = %s), %s, %s) """
        db.DBUtils.get_instance().execute_query(
            sql,
            (df.text[facet_signal], df.facet[facet_signal], org_id, prod_id),
            is_write=True,
            is_return=False)


def paraphrase_sentences(
    text,
    depth=int(constants.fetch_constant("language_depth")),
    project_id=constants.fetch_constant("google_project_id")):
    parent = client.location_path(project_id, "global")
    x = client.get_supported_languages(parent)
    target_languages = [item.language_code for item in x.languages[:depth]]
    translated_text = []
    for language in target_languages:
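        # The original listing is truncated here; a hedged completion that
        # mirrors the round-trip translation in Example #3 (this body is an
        # assumption, not the original code):
        forward = client.translate_text(
            parent=parent,
            contents=[text],
            mime_type='text/plain',
            target_language_code=language)
        back = client.translate_text(
            parent=parent,
            contents=[forward.translations[0].translated_text],
            mime_type='text/plain',
            source_language_code=language,
            target_language_code='en')
        translated_text.append(back.translations[0].translated_text)
    return list(set(translated_text))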
Example #5
from src.utilities import sken_logger, sken_singleton
from src.services import encoder

logger = sken_logger.get_logger("Sequence_match")


def interrogation_detector(sentence):
    ver_tags = ["VBZ", "VB", "VBD", "VBG", "VBN", "VBP"]
    doc = sken_singleton.Singletons.get_instance().get_nlp(sentence)
    matches = sken_singleton.Singletons.get_instance().get_phrase_matcher(doc)
    if len(matches) != 0:
        first_filter = doc[matches[0][1]:]
        tag_list = [item.tag_ for item in first_filter]
        dep_list = [item.dep_ for item in first_filter]

        if "subj" in first_filter[0].dep_ or "obj" in first_filter[0].dep_:
            return {
                "question": str(first_filter),
                "subject": "will be in answer",
                "tag": "direct-question without subject"
            }

        try:
            flag = False
            for i in range(1, len(tag_list)):
                for j in range(i + 1, len(tag_list)):
                    if tag_list[i] in ver_tags and "nsubj" in dep_list[j]:
                        logger.debug("Subject candidate at %d: %s", j, dep_list[j])
                        flag = True
                        subject_index = j
                        break
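                # The original listing is truncated here; the rest of this
                # function is a hedged completion (an assumption, not the
                # original code):
                if flag:
                    break
            if flag:
                return {
                    "question": str(first_filter),
                    "subject": str(first_filter[subject_index]),
                    "tag": "direct-question with subject"
                }
        except IndexError:
            logger.info("No subject located in: {}".format(sentence))
    return None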
Example #6
from src.utilities import sken_logger, constants
from laserembeddings import Laser

logger = sken_logger.get_logger("sken_singleton")


class Singletons:
    __instance = None
    laser_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization ")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("The singleton is already initialized you are attempting to initialize it again get lost")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            Singletons.__instance = self

    def perform_embeddings(self, all_sentences):
        """
        This method embeds all the sentences passed using Laser embedder
        :param all_sentences:
        :return: list of sentence embeddings
        """
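        # The original listing is truncated here; a minimal completion using
        # laserembeddings' documented embed_sentences API (the lang='en'
        # choice is an assumption):
        return self.laser_embedder.embed_sentences(all_sentences, lang='en')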
Example #7
from src.utilities import sken_logger, sken_singleton
from src.services import sentence_services
from concurrent.futures import ThreadPoolExecutor
import time
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json

logger = sken_logger.get_logger("match_result")


def make_result(signal, snippet, threshold):
    logger.info("Generating sentences for signal={}".format(signal))
    start = time.time()
    generated_signals = sentence_services.paraphrase_sentence(signal)
    logger.info("Extracting sentences out of {} signals".format(
        len(generated_signals) + 1))
    with ThreadPoolExecutor(max_workers=len(generated_signals) + 1) as exe:
        future = exe.map(sentence_services.get_extracted_sentences,
                         list(set(generated_signals + [signal])))
    extracted_sentences = []
    for item in future:
        extracted_sentences.extend(item['extracted_sentences'])
    paraphrased_signals = []
    for generated in set(generated_signals):
        paraphrased_signals.append({"signal_tag": "para", "signal": generated})
    extracted_signals = []
    for extracted in set(extracted_sentences):
        extracted_signals.append({"signal_tag": "extracted", "signal": extracted})
    logger.info("Final signal count={}".format(
        len(paraphrased_signals) + len(extracted_signals)))
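    # The original listing is truncated here; a hedged sketch of the scoring
    # step the imports point to: embed the snippet and every signal variant,
    # then keep variants whose cosine similarity clears the threshold. The
    # perform_embeddings call mirrors the singleton method shown in Example
    # #6; the aggregation details are assumptions.
    all_signals = paraphrased_signals + extracted_signals
    embedder = sken_singleton.Singletons.get_instance()
    vectors = np.asarray(embedder.perform_embeddings(
        [snippet] + [s["signal"] for s in all_signals]))
    sims = cosine_similarity(vectors[:1], vectors[1:])[0]
    matches = [dict(sig, score=float(sim))
               for sig, sim in zip(all_signals, sims) if sim >= threshold]
    logger.info("Matched {} of {} signals in {:.2f}s".format(
        len(matches), len(all_signals), time.time() - start))
    return json.dumps(matches)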
Example #8
import psycopg2
import sys
from src.utilities import constants
from src.utilities import sken_logger
from inspect import getframeinfo, stack
from psycopg2 import pool
from psycopg2.extras import execute_values
import re

logger = sken_logger.get_logger("db")


class DBUtils:
    __instance = None
    sales_pool = None

    @staticmethod
    def get_instance():
        """ Static access method. """
        if DBUtils.__instance is None:
            logger.info("Calling private constructor for connection pool initialization")
            DBUtils()
        return DBUtils.__instance

    def __init__(self):
        if DBUtils.__instance is not None:
            raise Exception("This is a singleton class ")
        else:
            logger.info(
                "Initializing database connection pool (should happen only once, at startup) for host {}".format(
                    constants.fetch_constant("host")))
Example #9
import multiprocessing
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

import pandas as pd

from src.services import text_service, signal_service
from src.utilities import sken_singleton, sken_logger, db
from src.utilities.objects import VadChunk, Match

logger = sken_logger.get_logger("scoring_service")


def vad_chunk_match(vad_chunk, product_id):
    tokens = text_service.make_root_word(
        text_service.get_tokens(vad_chunk.text))
    if len(tokens) > 0:
        logger.info("Made {} token for snippet = {}".format(
            len(tokens), vad_chunk.sid))
        signals = sken_singleton.Singletons.get_instance().get_cached_signals()[str(product_id)]

        def get_signal_scoring(signal):
            signal_df = signal.token_df
            threshold = signal.threshold
            matched = []
            matched_vals = []
            score = 0
            for tok in tokens:
                for i, val in enumerate(signal_df.val):
                    if val.isin([tok]).any() and tok not in matched_vals:
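                        # The original listing is truncated here; a hedged
                        # guess at the accumulation step (an assumption):
                        matched_vals.append(tok)
                        matched.append((tok, int(signal_df.score[i])))
                        score += int(signal_df.score[i])
            # Match's exact constructor is not shown; this call is a
            # placeholder sketch.
            if score >= threshold:
                return Match(vad_chunk, signal, score, matched)
            return None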
Example #10
from src.utilities import constants, db, sken_logger, sken_singleton, sken_exceptions
from src.utilities.objects import Signal
import pandas as pd
import os
import pickle
from src.services.text_service import make_root_word

logger = sken_logger.get_logger('signal_service')


def make_product_signal(signal_tokens, scores, threshold, value, product_id):
    signal_token_lists = []
    logger.info("Making signal_df")
    for token, score in zip(signal_tokens, scores):
        signal_token_lists.append({
            'val': pd.Series(make_root_word(signal_tokens[token])),
            'score': int(score)
        })

    df = pd.DataFrame(signal_token_lists)
    pickle_string = pickle.dumps(df)
    sql = "insert into public.product_signal (name, color, value, product_id, created_at, updated_at, is_active, " \
          "type, engine, match_type, do_generate) values(%s, '#f09600', %s, %s, now(), now(), true, '', " \
          "'RAZOR'" \
          ", 'BOTH', false) returning id; "

    rows, col_names = db.DBUtils.get_instance().execute_query(
        sql, (constants.fetch_constant("signal_name"), value, product_id),
        is_write=True,
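        # The original listing is truncated here; a plausible completion
        # (an assumption): finish the call and read the id produced by the
        # RETURNING clause. pickle_string, the serialized token DataFrame,
        # is presumably persisted or cached in the part not shown.
        is_return=True)
    signal_id = rows[0][0]
    return signal_id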
Example #11
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from src.utilities import sken_logger, constants, sken_exceptions
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

logger = sken_logger.get_logger("text_service")

not_accepted_pos = ["DT", "VBZ", "PRP", "VBP", "MD", "VB", "IN"]


def get_tokens(sentence):
    """
    This method produces the tokens from a sentence
    :param sentence:
    :return: token list
    """
    return word_tokenize(sentence)


def get_synonyms(sentence):
    """
    Breaks the sentence into tokens and POS-tags them; for each token whose
    tag is not in the restricted list, fetches synonyms using any of three
    available methods.
    @param sentence:
    @return:
    """
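    # The original listing is truncated here; a minimal completion using
    # NLTK's WordNet, one plausible source among the three the imports
    # suggest (this body is an assumption):
    from nltk.corpus import wordnet
    synonyms = {}
    for token, pos in nltk.pos_tag(word_tokenize(sentence)):
        if pos in not_accepted_pos:
            continue
        lemmas = {lemma.name().replace("_", " ")
                  for syn in wordnet.synsets(token)
                  for lemma in syn.lemmas()}
        lemmas.discard(token)
        if lemmas:
            synonyms[token] = sorted(lemmas)
    return synonyms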
Example #12
from src.utilities import sken_logger
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher

import os

logger = sken_logger.get_logger("Singleton")


class Singletons:
    __instance = None
    tagger = None
    nlp = None
    sequence_idx = None
    phrase_matcher = None

    @staticmethod
    def get_instance():
        """ Static access method. """
        if Singletons.__instance is None:
            logger.info("Calling Singletone private constructor")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("This class is a singleton!")
        else:
            logger.info("Initializing token tagger")
Example #13
import numpy as np

from src.utilities import sken_singleton, sken_logger, constants, sken_exceptions
import os
import pandas
from src.utilities.db import DBUtils
from src.utilities.objects import FacetSignal, Facet, CaughtFacetSignals
from src.services import snippet_service, facet_service

from concurrent.futures import ThreadPoolExecutor
from multiprocessing.pool import ThreadPool

pool = ThreadPool(2)

logger = sken_logger.get_logger("dimension_engine")


def refresh_cached_dims(org_id, prod_id):
    """
    This method refreshes the cached-dimensions singleton: it clears the
    cached dimensions whenever a new product request is made.
    :return:
    """

    with ThreadPoolExecutor(max_workers=2) as executor:
        # pass the bound clear method; calling clear() inside submit() would
        # run it synchronously and submit its None result. Both calls target
        # the same cache here, as in the original listing.
        executor.submit(sken_singleton.Singletons.get_instance()
                        .get_cached_lq_dims().clear)
        executor.submit(sken_singleton.Singletons.get_instance()
                        .get_cached_lq_dims().clear)
    logger.info(
Example #14
import spacy

from src.utilities import sken_logger, sken_singleton
from textblob import TextBlob
import re

logger = sken_logger.get_logger("Encodeer")


def return_clean_text(sentence):
    logger.info("Cleaning sentence:{}".format(sentence))
    return re.sub(r'[^a-zA-Z ]+', '', sentence).lower()


def sentence_breaker(sentence):
    """
    This method extracts all the sentences present in a single long sentence using TextBlob
    @param sentence: str
    @return: list of sentences present
    """
    if len(sentence.split()) > 0:
        testimonial = TextBlob(sentence)
        sentences = []
        for sent in testimonial.sentences:
            sentences.append(str(sent))
        return sentences
    return []  # no tokens: return an empty list instead of an implicit None


def get_tagged_sequence(sentence):
    clean_text = return_clean_text(sentence)
    tagger = sken_singleton.Singletons.get_instance().get_tagger()
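    # The original listing ends here; a hedged sketch of the rest, assuming
    # the tagger exposes an NLTK-style tag() method over tokens (its real
    # interface is not shown):
    return [tag for _, tag in tagger.tag(clean_text.split())]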