def praphrase_sentences(
        text,
        depth=int(constants.fetch_constant("language_depth")),
        project_id=constants.fetch_constant("google_project_id")):
    """
    Paraphrase a sentence by translating it into other languages and back.

    The text is translated from 'en-IN' into the first ``depth`` languages
    returned by ``client.get_supported_languages``; each of those translations
    is then translated back into 'en'.

    :param text: sentence to paraphrase
    :param depth: how many of the supported languages to round-trip through
    :param project_id: project id used to build the translation parent path
    :return: list with the back-translated sentences
    """
    parent = client.location_path(project_id, "global")
    supported = client.get_supported_languages(parent)
    language_codes = [
        entry.language_code for entry in supported.languages[:depth]
    ]

    # Forward pass: source sentence -> every target language.
    foreign_sentences = []
    for code in language_codes:
        forward = client.translate_text(
            parent=parent,
            contents=[text],
            mime_type='text/plain',  # mime types: text/plain, text/html
            source_language_code='en-IN',
            target_language_code=code)
        foreign_sentences.extend(
            item.translated_text for item in forward.translations)

    # Backward pass: each foreign sentence -> English. Languages and sentences
    # are paired by position, exactly as they were collected above.
    paraphrases = []
    for code, foreign in zip(language_codes, foreign_sentences):
        backward = client.translate_text(
            parent=parent,
            contents=[foreign],
            mime_type='text/plain',  # mime types: text/plain, text/html
            source_language_code=str(code),
            target_language_code="en")
        paraphrases.extend(
            item.translated_text for item in backward.translations)
    return paraphrases
def match_page():
    """
    URL for UI for matching the sentence or org_snippets.

    Calls ``signal_service.make_cached_signals_product`` for the task named by
    the "default_task_id" constant and then renders the matching page.

    :return: the rendered "match.html" template, or a status-500 response
        carrying the exception message when the service raises NoSignalFound
        or NoProductFound
    """
    try:
        # Call the service a single time and reuse its result for the debug
        # print instead of invoking it once per use.
        cached_product = signal_service.make_cached_signals_product(
            constants.fetch_constant("default_task_id"))
        print(cached_product)
        return render_template("match.html")
    except (sken_exceptions.NoSignalFound,
            sken_exceptions.NoProductFound) as exe:
        resp = Response(exe.message, status=500, mimetype='application/text')
        resp.headers['Access-Control-Allow-Origin'] = '*'
        return resp
def make_snippets(df, snippet_ids, task_id):
    """
    Build one VadChunk per row of ``df``.

    Rows are addressed by position (``.iloc``) so that they stay aligned with
    ``snippet_ids`` and with the sentence embeddings, both of which are
    positional, even when ``df`` does not carry a default 0..n-1 index (for
    example after filtering).

    :param df: frame with the columns "text", "from_time", "to_time",
        "speaker" and "orignal_ids"
    :param snippet_ids: snippet ids, positionally aligned with the rows of df
    :param task_id: task id passed to every VadChunk
    :return: list of VadChunk objects; empty list when df has no rows
    """
    if len(df) == 0:
        return []

    sentences = df["text"].to_list()
    sentence_vectors = sken_singleton.Singletons.get_instance(
    ).perform_embeddings(sentences)
    # Loop-invariant, so look it up once rather than once per row.
    encoding_method = constants.fetch_constant("encoding_method")

    from_times = df["from_time"]
    to_times = df["to_time"]
    speakers = df["speaker"]
    original_ids = df["orignal_ids"]

    vad_chunks = []
    for position, text in enumerate(sentences):
        vad_chunks.append(
            VadChunk(snippet_ids[position],
                     from_times.iloc[position],
                     to_times.iloc[position],
                     speakers.iloc[position],
                     text,
                     sentence_vectors[position],
                     None,
                     task_id,
                     original_ids.iloc[position],
                     questions=None,
                     q_encoding=None,
                     encoding_method=encoding_method))
    return vad_chunks
def make_product_signal(signal_tokens, scores, threshold, value, product_id):
    """
    Persist a product signal together with its pickled token/score frame.

    A DataFrame with one row per (token key, score) pair is built and pickled,
    then three statements are executed through ``db.DBUtils``: an insert into
    ``product_signal`` (returning the new id), an insert into
    ``signal_generated`` and an insert into ``product_signal_file`` holding
    the pickled frame and the threshold.

    :param signal_tokens: container iterated for its keys and indexed by them;
        each looked-up value is passed to ``make_root_word``
    :param scores: scores paired positionally with the keys of signal_tokens
    :param threshold: threshold stored next to the pickled frame
    :param value: signal text stored in product_signal and signal_generated
    :param product_id: product the signal belongs to
    :return: None
    """
    logger.info("Making signal_df")
    token_rows = [
        {
            'val': pd.Series(make_root_word(signal_tokens[key])),
            'score': int(weight)
        }
        for key, weight in zip(signal_tokens, scores)
    ]
    pickel_string = pickle.dumps(pd.DataFrame(token_rows))

    insert_signal = (
        "insert into public.product_signal (name, color, value, product_id, created_at, updated_at, is_active, "
        "type, engine, match_type, do_generate) values(%s, '#f09600', %s, %s, now(), now(), true, '', "
        "'RAZOR'"
        ", 'BOTH', false) returning id; ")
    rows, col_names = db.DBUtils.get_instance().execute_query(
        insert_signal,
        (constants.fetch_constant("signal_name"), value, product_id),
        is_write=True,
        is_return=True)
    # Id of the freshly inserted product_signal row, reused by both inserts.
    signal_id = rows[0][col_names.index("id")]

    insert_generated = (
        "INSERT INTO public.signal_generated (signal_id, text_, created_at, snippets_id, is_active) VALUES(%s, %s, "
        "now(), NULL, false); ")
    db.DBUtils.get_instance().execute_query(
        insert_generated, (signal_id, value), is_write=True, is_return=False)

    insert_file = "INSERT INTO public.product_signal_file (product_signal_id, signal_file, threshold) VALUES(%s, %s, %s); "
    db.DBUtils.get_instance().execute_query(
        insert_file, (signal_id, pickel_string, threshold),
        is_write=True,
        is_return=False)
    logger.info("Made signal entry in db")
def get_synonyms(sentence):
    """
    Break the sentence into tokens, POS-tag them and collect synonyms.

    Tokens whose POS tag is listed in ``not_accepted_pos`` get an empty
    synonym list; every other token is looked up with
    ``get_synonyms_thesaurus``, limited by the "max_synonims" constant.
    @param sentence: sentence to collect synonyms for
    @return: dict with "data" (one entry per token) and "max_len" (length of
        the longest synonym list among the entries)
    @raise sken_exceptions.NoTokensFound: when the sentence is blank or
        tokenisation produces no tokens
    """
    if len(sentence.split()) == 0:
        raise sken_exceptions.NoTokensFound

    tokens = get_tokens(sentence)
    logger.info("Made {} tokens for {}".format(len(tokens), sentence))
    if len(tokens) == 0:
        # Without this guard max() below fails with a bare ValueError.
        raise sken_exceptions.NoTokensFound

    # Loop-invariant limit, resolved once.
    synonym_limit = int(constants.fetch_constant("max_synonims"))
    result = []
    for word, pos in nltk.pos_tag(tokens):
        if pos not in not_accepted_pos:
            result.append(get_synonyms_thesaurus(word, synonym_limit))
        else:
            result.append({word: []})

    # Each entry is read as a single-key mapping token -> synonyms; only the
    # first value of an entry is measured.
    max_length = max(len(list(item.values())[0]) for item in result)
    return {"data": result, "max_len": max_length}
def __init__(self):
    """
    Load the model and tokenizer once and pick the device to run on.

    Both are loaded from the location named by the "model_path" constant. The
    model is moved to "cuda" when torch reports CUDA as available; otherwise
    the device is recorded as "cpu".

    :raises Exception: if an instance has already been registered
    """
    if Singletons.__instance is not None:
        raise Exception("This class is a singleton!")

    logger.info(" Loading Summarization model")
    self.model = AutoModelWithLMHead.from_pretrained(
        constants.fetch_constant("model_path"))

    logger.info(" Loading Tokenizer for the model")
    self.tokenizer = AutoTokenizer.from_pretrained(
        constants.fetch_constant("model_path"))

    logger.info("Checking CUDA availability")
    if not torch.cuda.is_available():
        logger.info("CUDA not found using CPU")
        self.device = "cpu"
    else:
        logger.info("Transferring model to gpu")
        self.device = "cuda"
        self.model.to(self.device)

    # Register this object as the one and only instance.
    Singletons.__instance = self
def sentence_matching():
    """
    Match the posted "sentence" form field against the default product.

    The sentence is wrapped in a VadChunk spoken by "Agent" and handed to
    ``scoring_service.vad_chunk_match`` with the "default_prod_id" constant.

    :return: JSON response with one object per match, sent with a permissive
        CORS header
    """
    sentence = request.form.get("sentence")
    chunk_start = time.time()
    chunk_end = time.time() + 1
    vad_chunk = VadChunk(1, chunk_start, chunk_end, "Agent", sentence, None,
                         None, None)
    matches = scoring_service.vad_chunk_match(
        vad_chunk, constants.fetch_constant("default_prod_id"))

    output = [
        {
            "input_sentence": match.snippet_text,
            "signal": match.signal_text,
            "tokens": match.matched_tokens,
            "score": str(match.score),
            "threshold": match.threshold,
            "id": match.signal_id,
            "html_": "",
        }
        for match in matches
    ]

    resp = Response(jsonpickle.encode(output), mimetype='application/json')
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
def _lq_score_matrix(question_vector, facets):
    """
    Score one question embedding against every signal of every facet.

    :param question_vector: embedding of a single snippet question
    :param facets: list of facets, each exposing ``facet_signals``
    :return: array of shape (len(facets), longest facet_signals list) holding
        the cosine similarity per (facet, signal) cell; cells without a signal
        or without an embedding stay at -inf so they can never be selected
    """
    width = max((len(facet.facet_signals) for facet in facets), default=0)
    scores = np.full((len(facets), width), -np.inf)
    query = [question_vector]
    query_norm = np.linalg.norm(query)
    for row, facet in enumerate(facets):
        for col, facet_signal in enumerate(facet.facet_signals):
            if facet_signal.embedding is None:
                # Signals cached without an embedding cannot be scored.
                continue
            scores[row, col] = (
                np.dot(query, np.array(facet_signal.embedding).T) /
                (query_norm * np.linalg.norm(facet_signal.embedding)))[0][0]
    return scores


def create_lq_maches(vad_chunks, threshold):
    """
    This method returns the caught lead_qualification facets that are caught
    for each snippet, only one facet signal can be caught across all the
    facets (the best scoring one, per snippet question).

    :param vad_chunks: snippets; those whose ``q_encoding`` is None are skipped
    :param threshold: minimum cosine similarity (anything ``float()`` accepts)
        the best signal must reach to be reported
    :return: list of CaughtFacetSignals, at most one per snippet question
    """
    caught_lq_facets = []
    logger.info("Making caught facets for lead_qualification")
    for vad_chunk in vad_chunks:
        if vad_chunk.q_encoding is None:
            continue
        # Read the cache once per snippet instead of once per matrix cell.
        facets = list(sken_singleton.Singletons.get_instance().
                      get_cached_lq_dims().values())
        for i, question in enumerate(vad_chunk.questions):
            scores = _lq_score_matrix(vad_chunk.q_encoding[i], facets)
            if scores.size == 0:
                # Nothing cached, or no facet has any signal.
                continue
            best_score = np.amax(scores)
            if best_score >= float(threshold):
                # First best cell in row-major order.
                facet_index, facet_signal_index = np.unravel_index(
                    np.argmax(scores), scores.shape)
                facet = facets[facet_index]
                facet_signal = facet.facet_signals[facet_signal_index]
                caught_lq_facets.append(
                    CaughtFacetSignals(
                        vad_chunk, vad_chunk.text, question, facet.name,
                        facet_signal, facet_signal.text, best_score,
                        constants.fetch_constant("encoding_method"),
                        "Lead-Qualification"))
    return caught_lq_facets
def make_snippet_question_embeddings(vad_chunk):
    """
    Store the embeddings of the snippet's questions on the snippet.

    When the snippet has no questions (``questions`` is None) the question
    encoding and its method are both set to None instead.
    :param vad_chunk: snippet whose question encoding is set
    :return: None
    """
    snippet_questions = vad_chunk.questions
    if snippet_questions is None:
        logger.info(
            "There were not snippet questions for snippet_id={}".format(
                vad_chunk.sid))
        vad_chunk.set_question_encoding(None, None)
        return

    question_vectors = sken_singleton.Singletons.get_instance(
    ).perform_embeddings(snippet_questions)
    method_name = constants.fetch_constant("encoding_method")
    vad_chunk.set_question_encoding(question_vectors, method_name)
    logger.info(
        "Calculated embeddings for {} snippet questions for snippet_id ={}"
        .format(len(vad_chunk.questions), vad_chunk.sid))
def __init__(self):
    """
    Create the threaded database connection pool and register the singleton.

    Pool bounds, host, password and database name are read from constants;
    the port is fixed to "5432".

    :raises Exception: if an instance has already been registered
    """
    if DBUtils.__instance is not None:
        raise Exception("This is a singleton class ")

    startup_message = (
        "Initializing connection pool for database connection, should happen only once during startup. with {}"
    )
    logger.info(startup_message.format(constants.fetch_constant("host")))

    # Constants are fetched in the same order as the pool arguments below.
    min_connections = constants.fetch_constant("min_pool")
    max_connections = constants.fetch_constant("max_pool")
    connection_settings = {
        "host": constants.fetch_constant("host"),
        "user": "******",
        "password": constants.fetch_constant("password"),
        "port": "5432",
        "database": constants.fetch_constant("db_name"),
    }
    self.sales_pool = pool.ThreadedConnectionPool(
        min_connections, max_connections, **connection_settings)

    logger.info("Made {} max_connections ".format(self.sales_pool.maxconn))
    DBUtils.__instance = self
import time from google.cloud import translate from concurrent.futures import ThreadPoolExecutor import multiprocessing from src.utilities import constants, sken_logger, db import spacy import textacy logger = sken_logger.get_logger("sentence_services") nlp = spacy.load("en_core_web_sm") client = translate.TranslationServiceClient() parent = client.location_path(constants.fetch_constant("translate_project_id"), "global") target_laguages = [ item.language_code for item in client.get_supported_languages( parent).languages[:constants.fetch_constant("translation_depth")] ] def paraphrase_sentence(text): global parent, target_laguages def get_the_other(language): response = client.translate_text( parent=parent, contents=[text], mime_type='text/plain', # mime types: text/plain, text/html source_language_code='en-IN',
def upload_csv():
    """
    Store an uploaded file and run the dimension engine on it.

    Expects a POST with the file under "file" and the form fields
    "threshold", "organization" and "product_id". The module globals
    ``tmp_pro_id`` and ``request_count`` remember the product of the previous
    upload so the cached dimensions are refreshed when the product changes.

    :return: JSON response with the engine output; a redirect back to the
        request URL when the file part is missing or unnamed; a 400 response
        when no organization is supplied; a 405 response for non-POST requests
    """
    global tmp_pro_id, request_count

    if request.method != "POST":
        # Falling through here used to return None, which Flask rejects.
        resp = Response("Upload the file with a POST request",
                        status=405,
                        mimetype='text/plain')
        resp.headers['Access-Control-Allow-Origin'] = '*'
        return resp

    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.url)

    upload = request.files['file']
    threshold = request.form.get("threshold")
    org_id = request.form.get("organization")
    product_id = request.form.get("product_id")

    if not org_id:
        # Without an organization no response object was ever built.
        resp = Response("Missing form field: organization",
                        status=400,
                        mimetype='text/plain')
        resp.headers['Access-Control-Allow-Origin'] = '*'
        return resp

    input_filename = secure_filename(upload.filename or '')
    if input_filename == '':
        # Covers both an empty name and a name that sanitises to nothing,
        # which would otherwise point the path at the upload folder itself.
        flash('No selected file')
        return redirect(request.url)

    # The folder is registered on the app under the literal key
    # 'UPLOAD_FOLDER', so read it back with that same key.
    input_file_path = os.path.join(app.config['UPLOAD_FOLDER'],
                                   input_filename)
    if os.path.exists(input_file_path):
        logger.info("File path {} already exists so removing this file".format(
            input_file_path))
        os.remove(input_file_path)
    logger.info("Making new file {}".format(input_file_path))
    upload.save(input_file_path)

    # NOTE(review): the first-request branch calls `wraper_method` with four
    # arguments while the other two call `wrapper_method` with three. Only one
    # spelling/signature can match dimension_engine -- confirm and unify.
    if request_count == 0:
        logger.info(
            "This is the first request for organization={} and product={}".
            format(org_id, product_id))
        tmp_pro_id = product_id
        request_count += 1
        result = dimension_engine.wraper_method(input_file_path, org_id,
                                                product_id, threshold)
    elif tmp_pro_id != product_id:
        # request_count is known to be non-zero here, so only the product
        # comparison decides this branch.
        logger.info(
            "First request for organization={} and product={} clearing the cache_facets for old_product={}"
            .format(org_id, product_id, tmp_pro_id))
        dimension_engine.refresh_cached_dims(org_id, product_id)
        request_count = 1
        tmp_pro_id = product_id
        result = dimension_engine.wrapper_method(input_file_path, org_id,
                                                 threshold)
    else:
        request_count += 1
        logger.info(
            "This is {} request for organization={} and product={}".format(
                request_count, org_id, tmp_pro_id))
        result = dimension_engine.wrapper_method(input_file_path, org_id,
                                                 threshold)

    resp = Response(jsonpickle.encode(result), mimetype='application/json')
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
# (cache key, facet name) pairs in the order the facets are published.
_INTRO_FACETS = (
    ("key_value_proposition", "key value proposition"),
    ("aspiration_setting", "aspiration setting"),
)
_LQ_FACETS = (
    ("authority", "authority"),
    ("budget", "budget"),
    ("interest", "interest"),
    ("need_investigation", "need investigation"),
)


def _facet_cache_key(dimension_name, facet_name):
    """Map lower-cased dimension/facet names to the cache key they feed."""
    if dimension_name == "introduction":
        if facet_name == "key value proposition":
            return "key_value_proposition"
        # Every other introduction facet counts as aspiration setting.
        return "aspiration_setting"
    if facet_name in ("authority", "budget", "interest"):
        return facet_name
    # Every remaining facet counts as need investigation.
    return "need_investigation"


def _build_facet_signal(generated_id, generated_value, facet_signal_id):
    """Create a FacetSignal, embedding its text when there is any."""
    if generated_value is None:
        return FacetSignal(generated_id,
                           generated_value,
                           facet_signal_id,
                           embedding=None,
                           embedding_method=None)
    return FacetSignal(
        generated_id,
        generated_value,
        facet_signal_id,
        embedding=sken_singleton.Singletons.get_instance().perform_embeddings(
            generated_value),
        embedding_method=constants.fetch_constant("encoding_method"))


def make_cached_dimensions(org_id, prod_id):
    """
    This method caches the facet signals for the particular product and
    organization.

    Nothing is done when both the lead-qualification and the introduction
    caches already hold entries. Otherwise the facet signals are read from
    the database, grouped into six facets and stored through the singleton's
    ``set_cached_intro_dims`` / ``set_cached_lq_dims``.
    :param org_id: organization whose facet signals are loaded
    :param prod_id: product whose facet signals are loaded
    :return: None
    :raises sken_exceptions.NoFacetFound: when the query returns no rows
    """
    singletons = sken_singleton.Singletons.get_instance()
    if (len(singletons.get_cached_lq_dims()) != 0
            and len(singletons.get_cached_intro_dims()) != 0):
        logger.info(
            "Skipping caching of facet_signals for organization={} and product_id ={}, they already exist in RAM"
            .format(org_id, prod_id))
        return

    logger.info(
        "Creating cached_dimensions for organization={} and product={}".
        format(org_id, prod_id))
    sql = "select dimension.id as dimid,dimension.name_ as dimname,facet.id as facet_id,facet.name_ as " \
          "facet_name,facet_signal.id as fsid,facet_signal.value as fsval,generated_facet_signals.id as gsid," \
          "generated_facet_signals.value as gs_value,facet_signal.org_id,facet_signal.product_id from dimension " \
          "left join facet on facet.dim_id = dimension.id left join facet_signal on facet_signal.facet_id = " \
          "facet.id left join generated_facet_signals on generated_facet_signals.facet_signal_id = " \
          "facet_signal.id where facet_signal.org_id=%s and facet_signal.product_id=%s group by dimension.id," \
          "facet.id,facet_signal.id,generated_facet_signals.id "
    rows, col_names = DBUtils.get_instance().execute_query(
        sql, (org_id, prod_id), is_write=False, is_return=True)

    if len(rows) == 0:
        logger.info(
            "No facet_signals found for organization={} and product={}".
            format(org_id, prod_id))
        raise sken_exceptions.NoFacetFound(org_id, prod_id)

    start = time.time()
    logger.info(
        "Making cache facet signals for organization= {} and product={}"
        .format(org_id, prod_id))

    # Resolve the column positions once instead of once per row and field.
    dimension_col = col_names.index("dimname")
    facet_col = col_names.index("facet_name")
    facet_signal_col = col_names.index("fsid")
    generated_id_col = col_names.index("gsid")
    generated_value_col = col_names.index("gs_value")

    signals = {key: [] for key, _ in _INTRO_FACETS + _LQ_FACETS}
    # Id of the last facet_signal row seen per facet; None when none was seen.
    facet_ids = dict.fromkeys(signals)
    for row in rows:
        key = _facet_cache_key(
            str(row[dimension_col]).lower(),
            str(row[facet_col]).lower())
        facet_ids[key] = row[facet_signal_col]
        signals[key].append(
            _build_facet_signal(row[generated_id_col],
                                row[generated_value_col],
                                row[facet_signal_col]))

    with ThreadPoolExecutor(max_workers=6) as executor:
        pending = [
            executor.submit(singletons.set_cached_intro_dims, key,
                            Facet(facet_ids[key], name, signals[key]))
            for key, name in _INTRO_FACETS
        ]
        pending += [
            executor.submit(singletons.set_cached_lq_dims, key,
                            Facet(facet_ids[key], name, signals[key]))
            for key, name in _LQ_FACETS
        ]
    # submit() stores exceptions on the future; surface them so a failed
    # cache write is not reported as success below.
    for future in pending:
        future.result()

    logger.info(
        "Cached {} facet signals for org={} and product={} in {}".format(
            sum(len(bucket) for bucket in signals.values()), org_id, prod_id,
            (time.time() - start)))
from flask import Flask, request, Response, render_template, flash, redirect, send_file from werkzeug.utils import secure_filename from src.utilities import sken_logger, db, sken_singleton, constants from src.services import dimension_engine from src.services import facet_service logger = sken_logger.get_logger("main") sken_singleton.Singletons.get_instance() db.DBUtils.get_instance() tmp_pro_id = None # used to catch and reset the catch if new product request is made request_count = 0 app = Flask(__name__) app.config['UPLOAD_FOLDER'] = constants.fetch_constant("upload_folder") @app.route('/') def index(): return render_template('index.html') @app.route("/upload_file", methods=["POST", "GET"]) def upload_csv(): global tmp_pro_id, request_count if request.method == "POST": if 'file' not in request.files: flash('No file part') return redirect(request.url)