def update_snippet_counts(file_id, count):
    con = get_engine()
    # Refresh the per-file counters; the fragment count for this file comes
    # from the caller, so scope the update to this file_id.
    con.execute(
        text(
            "update file_uploads set reviews_uploaded = (select count(*) from source_reviews "
            "where file_id=file_uploads.id), "
            "review_fragments_generated = {} "
            "where id = {};".format(count, file_id)
        ).execution_options(autocommit=True))
    con.execute(
        text(
            "update account_subscription set reviews_uploaded = (select count(*) from source_reviews sr "
            "join file_uploads fp on fp.id=sr.file_id "
            "join customer cr on cr.id=fp.customer_id where cr.account_id=account_subscription.account_id), "
            "total_fragment_generated = (select sum(review_fragments_generated) from file_uploads fp "
            "join customer cr on cr.id=fp.customer_id "
            "where cr.account_id=account_subscription.account_id), "
            "category_count=(select count(distinct sr.category_id) from source_reviews sr "
            "join file_uploads fp on fp.id=sr.file_id join customer cr on cr.id=fp.customer_id "
            "where cr.account_id=account_subscription.account_id);"
        ).execution_options(autocommit=True))
    # con.execute("update file_uploads set status='3' where id in ({});".format(''.join(file_ids)))
    con.dispose()
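# A minimal sketch of the same fragment-count update with a bound parameter
# instead of str.format(); duplicate_merge() below already uses this
# text()/params style, so the value can never break out of the statement.
# `_update_fragment_count` is illustrative only, not part of the original module.
def _update_fragment_count(file_id, count):
    con = get_engine()
    con.execute(
        text("update file_uploads set review_fragments_generated = :cnt "
             "where id = :fid").execution_options(autocommit=True),
        {'cnt': count, 'fid': file_id})
    con.dispose()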
def get_file_token(file_id):
    is_id = check_record('file_uploads', 'id', file_id)
    if not is_id:
        return False
    con = get_engine()
    file_token = con.execute(
        "select file_token from file_uploads where id={};".format(file_id))
    file_token = list(file_token.fetchone())[0]
    con.dispose()
    return file_token
def get_columns(table):
    con = get_engine()
    cols = con.execute("desc {};".format(table))
    cols = [
        x[0] for x in list(cols)
        if x and x[0] not in [u'id', u'created_date', u'modified_date']
    ]
    con.dispose()
    return cols
def get_colmap(account_id):
    con = get_engine()
    col_maps = con.execute(
        "select source_column_name, target_column_name from source_review_column_map "
        "where account_id={};".format(account_id))
    col_maps = dict([(x[0].strip(), x[1].strip()) for x in col_maps.fetchall()])
    con.dispose()
    return col_maps
def sentiment_upload(file_name, table, file_id):
    con = get_engine()
    data = pd.read_csv(file_name, sep='~', quoting=csv.QUOTE_ALL,
                       encoding='utf-8')
    data['file_id'] = int(file_id)
    data = data[[
        'category_id', 'file_id', 'source_review_id', 'source_product_id',
        'aspect', 'sentiment_type', 'sentiment_text', 'start_index',
        'end_index', 'confidence-score'
    ]]
    data.rename(columns={
        'source_review_id': 'review_id',
        'source_product_id': 'product_id',
        'aspect': 'aspect_id',
        'confidence-score': 'confidence_score'
    }, inplace=True)
    # Drop any previous snippets for this file before re-inserting
    con.execute("delete from {} where file_id in ({});".format(table, file_id))
    print "Upload is started"
    # Keep only the highest-confidence copy of each duplicate snippet
    data = data.sort_values(['confidence_score'], ascending=False)
    data = data.drop_duplicates(
        ['product_id', 'review_id', 'aspect_id', 'sentiment_text'],
        keep='first')
    data.to_sql(table, if_exists='append', con=con, index=False, chunksize=1000)
    # con.execute("update file_uploads set status='3' where id={};".format(file_id))
    print "Upload is completed for file {}".format(file_id)
    con.execute(
        "update file_uploads set status=2 where id = {};".format(file_id))
    # Send notification mail to the user
    # user_mail = con.execute("select email, first_name, file_token from customer c join file_uploads f on "
    #                         "f.customer_id=c.id where f.id={};".format(file_id))
    # user_mail, user_name, file_token = user_mail.fetchone()
    # Close the connection
    con.dispose()
    # try:
    #     requests.get("http://192.168.2.112:8080/completeAnalysis?fileToken={}".format(file_token), timeout=5)
    #     # send_mail.send_mail('*****@*****.**', [user_mail], "Completed processing your file!", "Dear {},
    #     # <br/> Successfully processed your reviews file and extracted the sentiment snippets." "<br/>You can
    #     # download the file from the downloads section!".format(user_name))
    # except:
    #     pass
    return True
def update_column(file_id, file_url):
    con = get_engine()
    try:
        # Drop any query string from the url before storing it
        file_url = file_url.split("?")[0]
        query = """update file_uploads set download_url="{}" where id={};""".format(
            file_url, file_id)
        con.execute(query)
        time.sleep(1)
        con.dispose()
        return True
    except Exception as Ex:
        con.dispose()
        logging.error("Failed to update download url!\n{}".format(str(Ex)))
        return False
def check_record(table, column, value):
    con = get_engine()
    # Quote the value only when it is not numeric
    if str(value).isdigit():
        recs = con.execute("select * from {} where {}={};".format(
            table, column, value))
    else:
        recs = con.execute("select * from {} where {}='{}';".format(
            table, column, value))
    con.dispose()
    return bool(len(list(recs)))
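# A minimal sketch of the same existence check using count(*) and a bound
# parameter, so only one row crosses the wire and the value cannot break out
# of the statement (table and column names still cannot be bound and must be
# trusted). `_check_record_sketch` is illustrative only.
def _check_record_sketch(table, column, value):
    con = get_engine()
    row = con.execute(
        text("select count(*) from {} where {} = :val".format(table, column)),
        {'val': value}).fetchone()
    con.dispose()
    return row[0] > 0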
def review_uploader(file_name, table, file_id, category_id):
    con = get_engine()
    data = pd.read_csv(file_name, sep='~', quoting=csv.QUOTE_ALL,
                       encoding='utf-8')
    data['file_id'] = int(file_id)
    data.rename(columns={
        'source_review_id': 'review_id',
        'source_product_id': 'product_id',
        'review_tag': 'review_title'
    }, inplace=True)
    data['category_id'] = category_id
    data = data[[
        'product_id', 'review_id', 'product_name', 'reviewer_name',
        'star_rating', 'review_date', 'review_title', 'category_id',
        'review_url', 'review_text', 'file_id'
    ]]
    data.to_sql(table, if_exists='append', con=con, index=False, chunksize=1000)
    con.execute(
        "update file_uploads set reviews_uploaded = (select count(*) from source_reviews "
        "where file_id=file_uploads.id);")
    con.execute(
        "update account_subscription set reviews_uploaded = (select count(*) from source_reviews sr "
        "join file_uploads fp on fp.id=sr.file_id join customer cr on cr.id=fp.customer_id "
        "where cr.account_id=account_subscription.account_id), "
        "category_count=(select count(distinct sr.category_id) from source_reviews sr "
        "join file_uploads fp on fp.id=sr.file_id join customer cr on cr.id=fp.customer_id "
        "where cr.account_id=account_subscription.account_id);")
    # Close the connection
    con.dispose()
def table_to_file(table, file_id, sep=','):
    con = get_engine()
    table_recs = pd.read_sql(
        "select o.*, sr.review_text from {} o join source_reviews sr on sr.file_id=o.file_id "
        "and sr.review_id=o.review_id where o.file_id={} "
        "and sr.file_id={};".format(table, file_id, file_id), con=con)
    reviews = pd.read_sql(
        "select review_id from source_reviews where file_id={};".format(
            file_id), con=con)
    aspect_names = pd.read_sql(
        "select aspect_id, aspect_name "
        "from client_category_aspect "
        "where (account_id, category_id) in (select c.account_id, f.category_id "
        "from file_uploads f join customer c on c.id=f.customer_id "
        "where f.id={});".format(file_id), con=con)
    cat_aspect_names = pd.read_sql(
        "select id as aspect_id, aspect_name from category_aspect where category_id in "
        "(select category_id from file_uploads where id={});".format(file_id),
        con=con)
    # Coercing to int because the db column of aspect_id is varchar as we were using aspect name
    table_recs.aspect_id = table_recs.aspect_id.map(int)
    # Separate records by validate status: the 'N' rows go to the error log
    error_log = table_recs[~(table_recs.validate_status == 'Y')]
    table_recs = table_recs[(table_recs.validate_status == 'Y')]
    # Merge the aspect names based on aspect_id
    table_recs = table_recs.merge(aspect_names, on=['aspect_id'], how='left')
    error_log = error_log.merge(cat_aspect_names, on=['aspect_id'], how='left')
    # Extracting counts
    snippet_count = table_recs.shape[0]
    review_count = reviews.shape[0]
    positive = table_recs[table_recs.sentiment_type == 'positive'].confidence_score
    negative = table_recs[table_recs.sentiment_type == 'negative'].confidence_score
    pmean, pstd = positive.mean(), positive.std()
    nmean, nstd = negative.mean(), negative.std()
    # Normalized gap between the positive and negative mean confidence scores
    norm = abs(pmean - nmean) / (pmean + nmean)
    if norm > 0.2:
        bias_msg = "Positive and negative confidence_score distributions are non-matching."
    elif pmean < 0.5:
        bias_msg = "Positive confidence_score distribution is biased towards zero."
    elif nmean < 0.5:
        bias_msg = "Negative confidence_score distribution is biased towards zero."
    else:
        bias_msg = "Confidence Score distribution seems good."
    del table_recs['aspect_id']
    del table_recs['id']
    try:
        del table_recs['validate_status']
        del table_recs['validate_comment']
    except:
        pass
    category_name = con.execute(
        "select c.category_name, f.file_name from category c join file_uploads f "
        "on f.category_id=c.id where f.id={};".format(file_id))
    category_name, raw_file_name = category_name.fetchone()
    con.dispose()
    base_name = "{}_{}_output.csv".format(raw_file_name, file_id)
    base_name2 = "{}_{}_error_log.csv".format(raw_file_name, file_id)
    file_name = app.saas_config['downloads_dir'] + base_name
    file_name2 = app.saas_config['downloads_dir'] + base_name2
    table_recs.to_csv(file_name, sep=sep, quoting=csv.QUOTE_ALL,
                      encoding='utf8', index=False)
    error_log.to_csv(file_name2, sep=sep, quoting=csv.QUOTE_ALL,
                     encoding='utf8', index=False)
    zip_file_path = app.saas_config['downloads_dir'] + base_name.replace(
        '.csv', '.zip')
    zip_status = zip_files([file_name, file_name2], zip_file_path)
    if zip_status['errors']:
        raise ValueError("Failed in zipping the files!")
    try:
        os.remove(file_name)
        os.remove(file_name2)
    except:
        pass
    return zip_file_path, review_count, snippet_count, pmean, nmean, bias_msg
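# Hypothetical sketch of the zip_files helper called above, assuming it takes
# a list of paths plus a target zip path and returns an {'errors': [...]}
# dict like the other helpers here; the real implementation may differ.
def _zip_files_sketch(file_paths, zip_file_path):
    import zipfile
    errors = []
    try:
        with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for path in file_paths:
                # Store only the base name inside the archive
                zf.write(path, arcname=os.path.basename(path))
    except Exception as ex:
        errors.append(str(ex))
    return {'errors': errors}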
def file_upload(file_path, table, sep=',', req_ip="kiwi.enixta.com"):
    con = get_engine()
    try:
        records = pd.read_csv(file_path, sep=sep, quoting=csv.QUOTE_ALL)
    except parsers.ParserError:
        try:
            records = pd.read_csv(file_path, sep=sep, quotechar='"',
                                  escapechar='\\')
        except Exception as Ex:
            con.dispose()
            return {
                "errors": [
                    "There was some error reading the file! Please format the file with proper quoting and "
                    "delimiter as ','(comma)."
                ],
                "messages": [
                    "There was some error reading the file! Please format the file with proper quoting "
                    "and delimiter as ','(comma)."
                ]
            }
    records = records[~(records.file_id.isnull())]
    file_ids = map(int, map(float, map(str, list(records.file_id.unique()))))
    # Drop the unwanted columns
    records = records[[
        'category_id', 'file_id', 'review_id', 'product_id', 'aspect_name',
        'sentiment_type', 'sentiment_text', 'start_index', 'end_index',
        'confidence_score'
    ]]
    # Strip the leading and trailing spaces from text fields
    records = records.apply(
        lambda row: [x.strip() if isinstance(x, (str, unicode)) else x
                     for x in row],
        axis=1)
    # Check that each file_id exists in the db
    missing_file_ids = []
    for file_id in file_ids:
        is_file_id = check_record('file_uploads', 'id', file_id)
        if not is_file_id:
            missing_file_ids.append(file_id)
    if missing_file_ids:
        con.dispose()
        return {
            "errors": "Missing file_ids in database {}! ".format(','.join(
                map(str, missing_file_ids))),
            "messages": ""
        }
    # Check for multiple file ids in the file
    if len(file_ids) > 1:
        con.dispose()
        return {
            "errors": "Multiple file ids found in the file {}.".format(file_path),
            "messages": ""
        }
    # Raise an error if there are no file ids present
    if len(file_ids) == 0:
        con.dispose()
        return {
            "errors": "File id column is empty in the file {}.".format(file_path),
            "messages": ""
        }
    # Get the unique aspect names from the file
    aspect_names = list(records.aspect_name.unique())
    # Check for missing / extra aspects present in the file
    extra_aspects = check_aspects(file_ids[0], aspect_names, con)
    if extra_aspects:
        # If extra aspects are present in the file, return an error
        con.dispose()
        return {
            "errors": "Not required aspect names are present in the file!\n{}".format(
                ','.join(map(str, extra_aspects))),
            "messages": ""
        }
    # Todo: Need to add the error return code to polarity also
    # Remove the not-required polarity snippets from the file
    records = records[records.sentiment_type.isin(['positive', 'negative'])]
    # Delete the entries from the db table if pre-existing
    if file_ids:
        # Todo: Try to suppress the reference check in db
        con.execute("delete from {} where file_id in ({});".format(
            table, ','.join(map(str, file_ids))))
    # Todo: make db upload async
    records.to_sql(table, if_exists='append', con=con, index=False,
                   chunksize=1000)
    file_id = file_ids[0]
    user_mail = con.execute(
        "select email, first_name, file_token from customer c join file_uploads f on "
        "f.customer_id=c.id where f.id={};".format(file_id))
    user_mail, user_name, file_token = user_mail.fetchone()
    final_files = {'id': file_id, 'token': file_token}
    con.dispose()
    return {
        "data": final_files,
        "errors": "",
        "messages": "Successfully uploaded the file to {}! "
                    "Valid record count is {}.".format(
                        table.replace('_', ' ').title(), records.shape[0])
    }
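# Hypothetical sketch of the check_aspects helper used above, assuming it
# compares the file's aspect names against client_category_aspect (the same
# table upload_output() later filters on) and returns any extras; the real
# helper may differ.
def _check_aspects_sketch(file_id, aspect_names, con):
    rows = con.execute(
        "select aspect_name from client_category_aspect "
        "where (account_id, category_id) in (select c.account_id, f.category_id "
        "from file_uploads f join customer c on c.id=f.customer_id "
        "where f.id={});".format(file_id))
    allowed = set(r[0].strip() for r in rows.fetchall())
    return [a for a in aspect_names if a not in allowed]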
def duplicate_merge(file_id, level=1):
    db = get_engine()
    file_name = "downloads/index_merged_{}_{}.csv".format(file_id, level)
    if level == 1:
        snippet_table = 'sentiment_output_temp'
    elif level == 2:
        snippet_table = 'sentiment_ouput'
    else:
        raise ValueError("Unknown merge level {}".format(level))
    reviews_without_text = pd.read_sql(
        "select * from {} where file_id={};".format(snippet_table, file_id),
        con=db)
    reviews_without_text.rename(columns={'aspect_name': 'aspect_id'},
                                inplace=True)
    reviews_without_text = reviews_without_text[[
        'id', 'product_id', 'review_id', 'aspect_id', 'sentiment_text'
    ]]
    time.sleep(1)
    review_text = pd.read_sql(
        "select review_id, review_text from source_reviews "
        "where file_id={};".format(file_id), con=db)
    reviews_with_text = reviews_without_text.merge(review_text,
                                                   on=['review_id'],
                                                   how='left')
    reviews_with_text = pd.DataFrame(
        list(reviews_with_text.apply(df_unicode, axis=1)))
    # Group the snippets by review and aspect before the parallel merge
    merged_text_dicts = {}
    for row in reviews_with_text.to_dict('records'):
        source_review_id = row['review_id']
        aspect = row['aspect_id']
        if source_review_id not in merged_text_dicts:
            merged_text_dicts[source_review_id] = {}
        if aspect not in merged_text_dicts[source_review_id]:
            merged_text_dicts[source_review_id][aspect] = []
        merged_text_dicts[source_review_id][aspect].append(row)
    merged_text_dicts = merged_text_dicts.values()
    del reviews_with_text
    outfile_path = file_name.split('.')[0] + 'output.csv'
    run_parallel(merged_text_dicts, outfile_path)
    out_reviews = pd.read_csv(outfile_path, quoting=csv.QUOTE_ALL,
                              delimiter=',')
    # Flag snippets whose text could not be located in the review
    index_no_sentiment_text = out_reviews[['start_index', 'end_index']].apply(
        lambda row: True if row['start_index'] == -1 or row['end_index'] == -1
        else False,
        axis=1)
    errors_snippet_not_found = out_reviews[index_no_sentiment_text]
    if level == 1 and not errors_snippet_not_found.empty:
        cant_find_snip_ids = ', '.join(
            map(str, map(int, list(errors_snippet_not_found.id.unique()))))
        db.execute(
            "update {} set validate_status='N', validate_comment='Not found in review' "
            "where id in ({});".format(snippet_table, cant_find_snip_ids))
    out_reviews = out_reviews[~index_no_sentiment_text]
    merged_reviews = out_reviews[out_reviews.overlap == 1]
    if level == 1 and not merged_reviews.empty:
        merged_ids = ', '.join(
            map(str, map(int, list(merged_reviews.id.unique()))))
        db.execute(
            "update {} set validate_status='N', validate_comment='Overlapping' "
            "where id in ({});".format(snippet_table, merged_ids))
    # Write the recomputed indexes and merged text back to the db
    for row in out_reviews.to_dict('records'):
        rec_id = row['id']
        start_index = row['start_index']
        end_index = row['end_index']
        sentiment_text = row['sentiment_text']
        query = text(
            "update {} set start_index=:sindx, end_index=:eindx, sentiment_text=:stext "
            "where id=:rid".format(snippet_table))
        db.execute(
            query, {
                'sindx': start_index,
                'eindx': end_index,
                'stext': sentiment_text,
                'rid': rec_id
            })
    db.dispose()
    try:
        os.remove(outfile_path)
    except:
        pass
    return True
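# Hypothetical sketch of the df_unicode helper applied row-wise above,
# assuming it decodes every byte-string field to unicode so mixed str/unicode
# frames do not raise UnicodeDecodeError later; the real helper may differ.
def _df_unicode_sketch(row):
    return dict(
        (col, val.decode('utf-8') if isinstance(val, str) else val)
        for col, val in row.to_dict().items())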
def output_to_db():
    # Parse parameters from the request
    args = dict(request.args)
    file_url = args.get('file', [None])[0]
    req_ip = request.remote_addr
    if file_url:
        # Download the file from the url
        file_name = download_file(file_url)
        # Get column names of the target table from db
        table_cols = get_columns('sentiment_ouput')
        # Check whether the required columns are present in the file
        missing_cols = file_validation_checks.check_file_header(
            file_name, table_cols, sep=',')
        if not missing_cols:
            # Upload the final file to the db
            file_creation = file_upload(file_name, 'sentiment_ouput', sep=',')
            # If there are errors, send an alert to the internal mail group
            if file_creation['errors']:
                msg = {'message': file_creation}
                app.logger.error(str(file_creation['errors']))
                send_mail(
                    alert_mails,
                    "Error in generating final output file for file {}".format(
                        file_name),
                    "Error: {}".format(file_creation['errors']))
                return return_err_response(msg, 298)
            file_status = file_creation['data']
            file_id = file_status.get('id')
            file_token = file_status.get('token')
            # Start index generation and duplicate snippet merging
            duplicates_merge.duplicate_merge(file_id, 2)
            db = get_engine()
            # Create the final output file for s3 upload
            file_path = create_final_output_file(file_id, db)
            if 'errors' in file_path:
                db.dispose()
                msg = {'message': {'errors': 'Error in final file creation!'}}
                send_mail(
                    alert_mails,
                    "Error in generating final output file for file {}".format(
                        file_name),
                    "Error: {}".format(file_path['errors']))
                return return_err_response(msg, 298)
            db.dispose()
            file_path = file_path['file']
            # Upload the final output to s3
            final_file_url = upload_to_s3(file_path)
            # Update the s3 file url in db
            update_column(file_id=file_id, file_url=final_file_url)
            # Call the complete-analysis api to send an alert to the customer
            try:
                requests.get(
                    "https://{}:8443/completeAnalysis?fileToken={}".format(
                        str(req_ip), file_token),
                    timeout=10,
                    verify=False)
            except Exception as apiex:
                app.logger.error(
                    "Failed to call the process completion email api!\n{}".format(
                        str(apiex)))
            return return_err_response({'message': file_creation}, 200)
        # For missing columns
        else:
            msg = {
                'message': {
                    "errors": "Missing columns in the file! {}".format(
                        ','.join(missing_cols))
                }
            }
            return return_err_response(msg, 298)
    # If the file url is missing in the request parameters
    else:
        msg = {
            'message': {
                "errors": "Missing file_url in the parameter list!"
            }
        }
        return return_err_response(msg, 298)
def run_smartpulse():
    args = dict(request.args)
    file_id = args.get('file_id', [None])[0]
    category_id = int(args.get('category_id', [0])[0])
    category = args.get('category', [None])[0]
    file_url = args.get('file', [None])[0]
    classification = "sentiment"
    account_id = int(args.get('account_id', [0])[0])
    if all([file_id, category_id, account_id, classification]):
        file_name_db = get_file_name(file_id)
        # Download the file from the url
        file_name = download_file(file_url)
        colmap = get_colmap(account_id)
        if not colmap:
            send_mail(
                alert_mails,
                "Error processing file {}! File configuration is missing "
                "for account id {}".format(file_name_db, account_id),
                "Column mapping is missing for account id {}!".format(
                    account_id))
            msg = {
                'message': {
                    "errors": "Column mapping is missing for account id {}! ".format(
                        account_id)
                }
            }
            return return_err_response(msg, 299)
        # Format the file as required by the smartpulse module
        check_point = file_validation_checks.order_columns(
            file_name, [
                'source', 'source_review_id', 'source_product_id',
                'product_name', 'review_date', 'star_rating', 'verified_user',
                'reviewer_name', 'review_url', 'review_tag', 'review_text'
            ], colmap)
        if check_point['errors']:
            app.logger.error(str(check_point['errors']))
            msg = {'message': check_point}
            return return_err_response(msg, 299)
        time.sleep(10)
        # Check that the file_id exists in file_uploads
        file_exist = check_record('file_uploads', 'id', file_id)
        if not file_exist:
            send_mail(
                alert_mails,
                "Error occurred while uploading reviews to database for file id {}!".format(
                    file_id),
                "Invalid file id {}. File id is missing in the file_uploads table!".format(
                    file_id))
            msg = {
                'message': {
                    "errors": "No record for file_id {} in database! ".format(file_id)
                }
            }
            return return_err_response(msg, 299)
        # Todo: Create a function to extract the lexicon data from db and create config dir with that data :completed
        # Create the config for smartpulse generation
        root_dir = os.path.join(BASE_DIR, 'smaartpulse/lexicons_db')
        final_dir = os.path.join(BASE_DIR, 'downloads')
        db = get_engine()
        pulse_config = LexiconMetadata(root_dir, file_id, account_id,
                                       category_id, db, file_name, final_dir,
                                       classification)
        db.dispose()
        if not pulse_config.status:
            error_msg = pulse_config.status_msg
            send_mail(
                alert_mails,
                "Lexicon config error for category {} with id {}.".format(
                    pulse_config.category, category_id), error_msg)
            return return_err_response({'errors': error_msg}, 299)
        # Start uploading the reviews to database
        try:
            review_uploader(pulse_config.input_file, 'source_reviews',
                            file_id, category_id)
        except sqlalchemy.exc.IntegrityError as err:
            trace = traceback.format_exc()
            app.logger.error(str(trace))
            if "Duplicate entry" in str(err):
                msg = {
                    'message': {
                        "errors": "Some error has occurred while uploading reviews to database! "
                                  "Duplicate entry for the table Source Reviews."
                    }
                }
                send_mail(
                    alert_mails,
                    "Error occurred while uploading reviews to database for file {} with "
                    "id {}!".format(file_name_db, file_id),
                    "Duplicate entry for the table Source Reviews.")
                return return_err_response(msg, 299)
            # Re-raise anything other than a duplicate-entry failure
            raise
        except Exception as Ex:
            trace = traceback.format_exc()
            app.logger.error(str(trace))
            send_mail(
                alert_mails,
                "Error occurred while uploading reviews to database for file {} with "
                "id {}!".format(file_name_db, file_id),
                "There was an error while uploading reviews!")
            msg = {
                'message': {
                    "errors": "Some error has occurred while uploading reviews to database! "
                              "Sent an internal alert for notification."
                }
            }
            return return_err_response(msg, 299)
        # Trigger the smartpulse module
        smartpulse.trigger_smartpulse(pulse_config)
        status = pulse_config.output
        result = status.get('status', None)
        if result:
            return upload_output(pulse_config, file_id=file_id)
        else:
            app.logger.error(str(status['error_stack']))
            send_mail(
                alert_mails,
                "Error in generating smartpulse for file {} with id {}".format(
                    file_name, file_id),
                str('\n'.join(status['errors'])).replace('\\n', '<br/>') +
                "<br/><br/><b>Error Stack:</b>" +
                str(status['error_stack']).replace('\\n', '<br/>'))
            return return_err_response(status, 299)
    else:
        missing_parameters = []
        for param, value in dict(
                zip(['file_id', 'category_id', 'account_id', 'classification'],
                    [file_id, category_id, account_id,
                     classification])).items():
            if not value:
                missing_parameters.append(param)
        return parameter_missing_error(missing_parameters)
def upload_output(pulse_config, file_id):
    result_dict = pulse_config.output
    # Read the treemap file from the result of the smartpulse module
    treemap_file = result_dict['output_file']
    print "Upload output ", treemap_file
    # Verify that the file structure and data are as required
    checks = file_validation_checks.file_validator(outputfile=treemap_file)
    file_name = get_file_name(file_id)
    # If there are errors, send a mail to the internal group
    if checks['errors']:
        # Output file validation error: send mail to ml-team
        app.logger.error(str(checks['errors']))
        msg = {'message': checks}
        send_mail(
            alert_mails,
            "Error in generating smartpulse for file {} (id {})".format(
                file_name, file_id),
            str('\n'.join(checks['errors'])).replace('\n', '<br/>'))
        return return_err_response(msg, 299)
    else:
        # if checks['errors'] and False:
        #     send_mail(['*****@*****.**'],
        #               "Error in generating smartpulse for file id {}".format(file_id),
        #               str('\n'.join(checks['errors'])).replace('\n', '<br/>'))
        try:
            # Upload snippets to database
            upload = sentiment_upload(treemap_file, 'sentiment_output_temp',
                                      file_id)
            if upload:
                # Start validating the snippets
                # External lexicon validation implemented in backend
                lexicon_validation(file_id)
                # Mark the not-required snippets with status 'N'
                db = get_engine()
                db.execute(
                    "update sentiment_output_temp set validate_status='N', validate_comment='Aspect not required' "
                    "where aspect_id not in (select aspect_id from client_category_aspect "
                    "where (account_id, category_id) in (select c.account_id, f.category_id "
                    "from file_uploads f join customer c on c.id=f.customer_id where f.id={})) "
                    "and file_id={}".format(file_id, file_id))
                time.sleep(0.5)
                db.execute(
                    "update sentiment_output_temp set validate_status='N', validate_comment='Polarity not required' "
                    "where file_id={} "
                    "and LOWER(trim(sentiment_type)) not in ('positive','negative');".format(file_id))
                # Close the db connection
                db.dispose()
                # Queue the file for index merging and poll until the worker
                # reports it back through the shared output list
                app.index_merge_queue.append((file_id, 'index', 1))
                merged_files = app.index_merge_queue_output
                while file_id not in merged_files:
                    time.sleep(60)
                merged_files.remove(file_id)
                print "got merged file for nlp"
                gnlp = google_nlp_validation(file_id, file_name)
                if gnlp:
                    file_for_validation(file_id)
                else:
                    return return_err_response(
                        "Error in google nlp validation", 299)
                try:
                    shutil.rmtree(pulse_config.client_dir)
                except:
                    print "error in deleting client lexicon config dir"
                return return_response("successfully generated snippets.", 200)
        except Exception as Ex:
            trace = traceback.format_exc()
            app.logger.error(str(trace))
            msg = {'message': {'errors': [str(Ex)], 'error_stack': str(trace)}}
            send_mail(
                alert_mails,
                "Error in uploading smartpulse to db for file {} with id {}".format(
                    file_name, file_id),
                str(Ex).replace('\\n', '<br/>') +
                "<br/><br/><b>Error Stack:</b>" +
                str(trace).replace('\\n', '<br/>'))
            return return_err_response(msg, 299)
def google_nlp(file_id, output_path='downloads/google'):
    output_file = os.path.join(output_path,
                               '{}_google_nlp.csv'.format(file_id))
    cluster_size = 1000
    db = get_engine()
    reviews = pd.read_sql(
        "select product_id, review_id, review_text from source_reviews "
        "where file_id={} and (file_id, review_id) in "
        "(select distinct file_id, review_id from sentiment_output_temp "
        "where file_id={} and validate_status='Y');".format(file_id, file_id),
        con=db)
    reviews = reviews[['product_id', 'review_id', 'review_text']].drop_duplicates(
        ['review_id', 'review_text'])
    reviews = pd.DataFrame(list(reviews.apply(df_unicode, axis=1)))
    if reviews.empty:
        raise ValueError("No Reviews found for file id {}".format(file_id))
    reviews.review_text = reviews.review_text.apply(punctuation_correction)
    # Skip reviews already processed in a previous run, if an output file exists
    try:
        previous_run = pd.read_csv(output_file, delimiter=',', quotechar='"',
                                   names=[
                                       'product_id', 'review_id', 'sentence',
                                       'sentiment_score', 'sentiment_magnitude'
                                   ])
        reviews = reviews[~reviews.review_id.isin(
            set(list(previous_run.review_id)))]
        del previous_run
    except:
        pass
    print "Review Count is ", reviews.shape[0]
    if not reviews.empty:
        credentials_list = get_creds()
        manager = mp.Manager()
        review_queue = manager.Queue()
        # Split each product's reviews into clusters of up to cluster_size
        for product_id in set(list(reviews.product_id)):
            review_texts = reviews[reviews.product_id == product_id].to_dict(
                'records')
            row_count = len(review_texts)
            clusters = (row_count / cluster_size) if (
                row_count % cluster_size == 0) else (row_count / cluster_size + 1)
            for cl in range(clusters):
                cluster_range1 = cl * cluster_size
                cluster_range2 = min([(cl + 1) * cluster_size, row_count])
                cluster = review_texts[cluster_range1:cluster_range2]
                review_queue.put(cluster)
        output_queue = manager.Queue()
        # Start the writer process and feed it from the api client
        write_process = mp.Process(target=to_file,
                                   args=(output_queue, 1, output_file))
        write_process.start()
        gcloud_client(review_queue, output_queue, credentials_list,
                      direct=True)
        # Join the writer process after completion
        write_process.join()
        write_process.terminate()
    logger.info(
        "Completed Extracting google nlp snippets for file_id {}".format(
            file_id))
    nlp_snippets = pd.read_csv(output_file, sep=',', quotechar='"',
                               quoting=csv.QUOTE_ALL,
                               names=[
                                   'product_id', 'review_id', 'sentence',
                                   'sentiment_score', 'sentiment_magnitude'
                               ]).drop_duplicates()
    nlp_snippets = nlp_snippets[~(nlp_snippets.sentence.isnull())]
    # Bucket the google sentiment scores into polarity labels
    sent = []
    for row in nlp_snippets.to_dict("records"):
        if float(row["sentiment_score"]) > 0.0:
            row["sentiment"] = "positive"
        elif float(row["sentiment_score"]) < -0.25:
            row["sentiment"] = "negative"
        else:
            row["sentiment"] = "neutral"
        sent.append(row)
    nlp_snippets = pd.DataFrame.from_dict(sent)
    del sent
    enx_snippets = pd.read_sql(
        "select id, review_id, sentiment_text, sentiment_type, confidence_score "
        "from sentiment_output_temp "
        "where file_id={} and validate_status='Y';".format(file_id), con=db)
    review_ids = list(enx_snippets.review_id.unique())
    # Normalize snippet text to lowercase alphanumerics for containment checks
    enx_snippets['sent_text_cleaned'] = enx_snippets.sentiment_text.apply(
        lambda x: ''.join([y for y in x.lower() if y.isalnum()]))
    enx_snippets_dict = dict()
    for row in enx_snippets.to_dict('records'):
        review_id = row['review_id']
        if review_id not in enx_snippets_dict:
            enx_snippets_dict[review_id] = list()
        enx_snippets_dict[review_id].append(row)
    del enx_snippets
    nlp_snippets['sent_text_cleaned'] = nlp_snippets.sentence.apply(
        lambda x: ''.join([y for y in x.lower() if y.isalnum()]))
    nlp_snippets_dict = dict()
    for row in nlp_snippets.to_dict('records'):
        review_id = row['review_id']
        if review_id not in nlp_snippets_dict:
            nlp_snippets_dict[review_id] = list()
        nlp_snippets_dict[review_id].append(row)
    del nlp_snippets
    match = []
    non_match = []
    print "Started validation against google nlp output!"
    for review_id in review_ids:
        reviews_google = nlp_snippets_dict.get(review_id, [])
        reviews_enixta = enx_snippets_dict[review_id]
        for row1 in reviews_enixta:
            row_id = row1['id']
            found = False
            for row2 in reviews_google:
                if row1['sent_text_cleaned'] in row2['sent_text_cleaned'] \
                        or row2['sent_text_cleaned'] in row1['sent_text_cleaned']:
                    found = True
                    row1['sentiment_type'] = row1['sentiment_type'].strip()
                    row1['sentiment_type'] = row1['sentiment_type'].replace(
                        'most-positive', 'positive')
                    row1['sentiment_type'] = row1['sentiment_type'].replace(
                        'most-negative', 'negative')
                    # Agreeing polarity or a very confident snippet counts as a match
                    if row2['sentiment'] == row1['sentiment_type'] or \
                            row1['confidence_score'] > 0.95:
                        match.append(row_id)
                    else:
                        non_match.append(row_id)
                    break
            if not found:
                if row1['confidence_score'] > 0.95:
                    match.append(row_id)
                else:
                    non_match.append(row_id)
    non_match_ids = ', '.join(map(str, non_match))
    print "completed google nlp comparison"
    if non_match_ids:
        query = "update sentiment_output_temp set validate_status='N', validate_comment='Failed Google NLP Validation' " \
                "where id in ({})".format(non_match_ids)
        db.execute(query)
    db.dispose()
    return True
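# Hypothetical sketch of the punctuation_correction helper applied above,
# assuming it inserts a missing space after sentence punctuation so the NLP
# api's sentence splitter finds the boundaries; the real helper may differ.
import re

def _punctuation_correction_sketch(review_text):
    # Turn ".Word" into ". Word" so glued sentences are split correctly,
    # while leaving decimals like "3.5" untouched
    return re.sub(ur'([.!?;])(?=[A-Za-z])', ur'\1 ', review_text)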