def getLessons(credentials): """Description: Function to query all lessons data from elasticsearch Returns: dataframe with all the lessons data Usage: >>> import ElasticFunctions as ef >>> df = ef.getLessons(credentials) """ es = Elasticsearch(['http://' + credentials["username"] + ':' + credentials["password"] + '@' + credentials["ip_and_port"]], timeout=600) doc = { "query": { "term": { "isLesson": { "value": True, "boost": 1.0 } } } } lessonsDF = pd.DataFrame() data = es.search(index="sentences", body=doc, scroll='1m') scrollId = data['_scroll_id'] scrollSize = len(data['hits']['hits']) while scrollSize > 0: if lessonsDF.empty: lessonsDF = Select.from_dict(data).to_pandas() else: lessonsDF = lessonsDF.append(Select.from_dict(data).to_pandas()) data = es.scroll(scroll_id = scrollId, scroll = '1m') scrollId = data['_scroll_id'] scrollSize = len(data['hits']['hits']) return lessonsDF
def getBaseClassification(credentials):
    """Function to query all base classification data from Elasticsearch.

    Returns:
        dataframe with all the base classification data

    Usage:
        >>> import ElasticFunctions as ef
        >>> df = ef.getBaseClassification(credentials)
    """
    es = Elasticsearch(['http://' + credentials["username"] + ':' +
                        credentials["password"] + '@' + credentials["ip_and_port"]],
                       timeout=600)
    doc = {
        'size': 10000,
        'query': {
            'match_all': {}
        }
    }
    baseClassificationDF = pd.DataFrame()
    data = es.search(index="base-classification", body=doc, scroll='1m')
    scrollId = data['_scroll_id']
    scrollSize = len(data['hits']['hits'])
    while scrollSize > 0:
        if baseClassificationDF.empty:
            baseClassificationDF = Select.from_dict(data).to_pandas()
        else:
            baseClassificationDF = baseClassificationDF.append(
                Select.from_dict(data).to_pandas())
        data = es.scroll(scroll_id=scrollId, scroll='1m')
        scrollId = data['_scroll_id']
        scrollSize = len(data['hits']['hits'])
    return baseClassificationDF
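# A minimal calling sketch for the two helpers above, following their docstring usage.
# The credentials values are placeholders inferred from how the connection URL is
# assembled (username, password, ip_and_port); they are not taken from the original module.
import ElasticFunctions as ef

credentials = {
    "username": "elastic",           # placeholder
    "password": "changeme",          # placeholder
    "ip_and_port": "localhost:9200"  # placeholder
}
lessonsDF = ef.getLessons(credentials)
baseClassificationDF = ef.getBaseClassification(credentials)
print(lessonsDF.shape, baseClassificationDF.shape)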
def get_df_ch(rut):
    eco = ecomart.Ecomart()
    li_docs = eco.query_li_rut(rut)
    lista_i3_identity = []
    try:
        df_li = Select.from_dict(li_docs).to_pandas()
        df_li = df_li[['i3_identity', 'canal_id', 'table_name', 'dbname']]
        lista_i3_identity = df_li['i3_identity'].unique()
    except Exception:
        pass
    df_all = pd.DataFrame()  # default when no identities are found
    if len(lista_i3_identity):
        df_ch = []
        for i3 in lista_i3_identity:
            ch_docs = eco.query_ch_i3(i3)
            if len(ch_docs) > 0:
                try:
                    ch = Select.from_dict(ch_docs).to_pandas()
                    ch2 = ch[[
                        'i3_identity', 'canal_id', 'callconnectedtimeUTC',
                        'wcode_displayname', 'wcat_displayname'
                    ]]
                    df_ch.append(ch2)
                except Exception:
                    pass
        df_all = pd.concat(df_ch)
    return df_all
def inin():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except Exception:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    li_docs = eco.query_li_rut(rut)
    lista_i3_identity = []
    try:
        # li = li_docs['hits']['hits'][0]['_source']
        # df = json2html.convert(json=li)
        df_li = Select.from_dict(li_docs).to_pandas()
        # df_li.drop(columns=['_index', '_type', '_id', '_score'], inplace=True, errors='ignore')
        df_li = df_li[['canal_id', 'table_name', 'dbname', 'i3_identity']]
        lista_i3_identity = df_li['i3_identity'].unique()
    except Exception:
        pass

    if len(lista_i3_identity):
        df_ch = []
        for i3 in lista_i3_identity:
            ch_docs = eco.query_ch_i3(i3)
            if len(ch_docs) > 0:
                try:
                    ch = Select.from_dict(ch_docs).to_pandas()
                    ch2 = ch[[
                        'campaignname', 'canal_id', 'callconnectedtimeUTC',
                        'wcode_displayname', 'wcat_displayname', 'length',
                        'i3_identity'
                    ]]
                    df_ch.append(ch2)
                except Exception:
                    pass
        df_all = pd.concat(df_ch)
    else:
        return 'sin datos'

    found, datos = get_basic_data(rut)
    if found:
        df_all.sort_values(by=['callconnectedtimeUTC'], inplace=True)
        # Generate a timeline-style chart (disabled)
        # chartt.gen_timeline(df_all)
        # tl_chart = chartt.gen_timeline2(df_all)
        graficar = 0
        if graficar == 1:
            chartt.chart_wcode_x_date(df_all)
        return render_template('inin.html',
                               rut=rut,
                               ap_pat=datos['apellido_paterno'],
                               ap_mat=datos['apellido_materno'],
                               primer_nombre=datos['primer_nombre'],
                               li=df_li,
                               ch=df_all,
                               graficar=graficar)
def process_hits(hits):
    for item in hits:
        # Process hits here; `data` (the raw search response) is taken from the
        # enclosing scope and converted once.
        a = Select.from_dict(data).to_pandas()
        print(a.shape)
        print(list(a.columns.values))
        return a
def index_data_to_df(
    self,
    index_name: str,
    start_time: str,
    end_time: str,
    field_list: str,
):
    query = {"range": {"@timestamp": {"gte": start_time, "lte": end_time}}}
    sort = {"@timestamp": {"order": "asc"}, "_id": {"order": "asc"}}
    max_size = 5000
    search_after = [0, 0]
    pandas_df = pd.DataFrame()
    while True:
        result_dict = self._es_query(index_name, query, sort, search_after, max_size)
        if len(result_dict['hits']['hits']) == 0:
            break
        tmp = Select.from_dict(result_dict).to_pandas()
        pandas_df = pandas_df.append(tmp[field_list])
        # Resume from the last document seen (search_after pagination).
        search_after = [
            self._unix_time_millis(pandas_df['@timestamp'].iloc[-1]),
            pandas_df['_id'].iloc[-1]
        ]
    pandas_df.to_csv('input_ryu.csv')
    return pandas_df
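# The _es_query helper used above is not shown; the sketch below is an assumption
# about what it might look like, not the original implementation. It simply wraps
# es.search with the sort and search_after parameters so the loop above can page
# through results. `self.es` (an Elasticsearch client on the instance) is assumed.
def _es_query(self, index_name, query, sort, search_after, max_size):
    return self.es.search(
        index=index_name,
        body={
            "size": max_size,
            "query": query,
            "sort": sort,
            "search_after": search_after,
        },
    )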
def getResultScrolling(self, partRes, partialSize=None):
    df = None
    scrollId = partRes['_scroll_id']
    # Do you want the whole index or partial data?
    scrollSize = partRes['hits']['total']['value']
    if partialSize:
        if partialSize <= scrollSize:
            scrollSize = partialSize
    while scrollSize > 0:
        # print(scrollSize)
        pandas_df = Select.from_dict(partRes).to_pandas()
        chunkSize = len(partRes['hits']['hits'])
        if df is None:
            df = pandas_df
        else:
            df = df.append(pandas_df, ignore_index=True)
        # arrayHits = partRes['hits']['hits']
        # for hit in arrayHits:
        #     for key, val in hit["_source"].items():
        #         try:
        #             fields[key] = np.append(fields[key], val)
        #         except KeyError:
        #             fields[key] = np.array([val])
        scrollSize = scrollSize - chunkSize
        partRes = self.scroll(scroll_id=scrollId)
    df.drop(columns=["_index", "_type", "_id", "_score"], axis=1, inplace=True)
    return df
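# Hedged sketch of how the scroller above might be driven. Since the class exposes
# self.scroll, it presumably also exposes self.search; that, the method name, and the
# page size are assumptions for illustration: issue an initial search with scroll
# enabled and hand the first page to getResultScrolling.
def getIndexAsDataFrame(self, index_name, partialSize=None):
    firstPage = self.search(index=index_name,
                            body={"query": {"match_all": {}}},
                            scroll='1m',
                            size=1000)
    return self.getResultScrolling(firstPage, partialSize=partialSize)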
def comportamiento():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except Exception:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    lu_docs = eco.query_lu_rut(rut)
    try:
        df = Select.from_dict(lu_docs).to_pandas()
        df.drop(columns=['_index', '_type', '_id', '_score'],
                inplace=True, errors='ignore')
        df_lu = df[[
            'fecha_generacion', 'lista_cliente', 'motivo_no_pago',
            'fase_inicio', 'fase_dia', 'saldo_cliente', 'dias_mora_real',
            'saldo_castigado', 'sucursal_cobranza', 'banca_cliente'
        ]]
        df_lu.sort_values(by=['fecha_generacion'], inplace=True)
        df_ch = get_df_ch(rut)
        chartt.chart_comp(df_lu, df_ch)
        chartt.chart_wcode_x_date(df_ch)
        return render_template('comportamiento.html',
                               rut=rut,
                               dv=df['dv'][0],
                               ap_pat=df['apellido_paterno'][0],
                               ap_mat=df['apellido_materno'][0],
                               primer_nombre=df['primer_nombre'][0],
                               lu=df_lu)
    except Exception as e:
        return 'sin datos.' + str(e)
def deudor_head():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except Exception:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    lu_docs = eco.query_lu_rut(rut)
    try:
        # lu = lu_docs['hits']['hits'][0]['_source']
        # df = json2html.convert(json=lu)
        df = Select.from_dict(lu_docs).to_pandas()
        df.drop(columns=['_index', '_type', '_id', '_score'],
                inplace=True, errors='ignore')
        df_lu = df[[
            'fecha_generacion', 'lista_cliente', 'motivo_no_pago',
            'fase_inicio', 'fase_dia', 'saldo_cliente', 'dias_mora_real'
        ]]
        df_lu.sort_values(by=['fecha_generacion'])
        otros_datos = get_otros_datos(df)
        return render_template('deudor.html',
                               rut=rut,
                               dv=df['dv'][0],
                               ap_pat=df['apellido_paterno'][0],
                               ap_mat=df['apellido_materno'][0],
                               primer_nombre=df['primer_nombre'][0],
                               otros_datos=otros_datos)
    except Exception as e:
        return 'sin data.' + str(e)
def getTopics(credentials):
    es = Elasticsearch(['http://' + credentials["username"] + ':' +
                        credentials["password"] + '@' + credentials["ip_and_port"]],
                       timeout=600)
    doc = {'size': 10000, 'query': {'match_all': {}}}
    topicsDF = pd.DataFrame()
    data = es.search(index="topics", body=doc, scroll='1m')
    scrollId = data['_scroll_id']
    scrollSize = len(data['hits']['hits'])
    while scrollSize > 0:
        if topicsDF.empty:
            topicsDF = Select.from_dict(data).to_pandas()
        else:
            topicsDF = topicsDF.append(Select.from_dict(data).to_pandas())
        data = es.scroll(scroll_id=scrollId, scroll='1m')
        scrollId = data['_scroll_id']
        scrollSize = len(data['hits']['hits'])
    return topicsDF
def get_tweets(self, index_search):
    index_aux = "tweets-sentiment_" + index_search
    df1 = DataFrame.from_es(url="http://192.168.127.129:9200",
                            index=index_aux,
                            doc_type="new-tweet",
                            compat=6)
    es = Elasticsearch('http://192.168.127.129:9200')
    results = es.search(index=index_aux,
                        size=5600,
                        body={"query": {"match_all": {}}})
    df = Select.from_dict(results).to_pandas()
    return df
def ajax2():
    print('ajax2')
    rut = request.form.get('rut')
    print('rut:', rut, flush=True)
    print('Buscando para RUT:', rut, flush=True)
    mensaje = ''
    resultado = 0
    try:
        eco = ecomart.Ecomart()
        lu_docs = eco.query_lu_rut(rut)
        df = Select.from_dict(lu_docs).to_pandas()
        if len(df.index) > 0:
            resultado = 1
            deu.set_rut(rut)
    except Exception:
        mensaje = 'No encontre el rut consultado.'
    try:
        ap_pat = df['apellido_paterno'][0]
    except Exception:
        ap_pat = 'Apellido Paterno, sin datos'
    try:
        ap_mat = df['apellido_materno'][0]
    except Exception:
        ap_mat = 'Apellido Materno, sin datos'
    try:
        primer_nombre = df['primer_nombre'][0]
    except Exception:
        primer_nombre = 'Nombre, Sin datos'
    respuesta = {
        'resultado': resultado,
        'mensaje': mensaje,
        'rut': rut,
        'nombre': primer_nombre,
        'apellido_paterno': ap_pat,
        'apellido_materno': ap_mat,
        'edad': 'ND'
    }
    return jsonify(respuesta)
def get_basic_data(rut):
    found = False
    try:
        print('buscando en ecomart con RUT : ' + str(rut))
        eco = ecomart.Ecomart()
        lu_docs = eco.query_lu_rut(rut)
        df = Select.from_dict(lu_docs).to_pandas()
        print('largo df:', len(df.index), flush=True)
        if len(df.index) > 0:
            deu.set_rut(rut)
            found = True
            print('encontro.')
            try:
                ap_pat = df['apellido_paterno'][0]
            except Exception:
                ap_pat = 'Sin datos'
            try:
                ap_mat = df['apellido_materno'][0]
            except Exception:
                ap_mat = 'Sin datos'
            try:
                primer_nombre = df['primer_nombre'][0]
            except Exception:
                primer_nombre = 'Sin datos'
            basic_data = {
                'apellido_paterno': ap_pat,
                'apellido_materno': ap_mat,
                'primer_nombre': primer_nombre,
            }
        else:
            basic_data = {
                'apellido_paterno': 'Sin datos',
                'apellido_materno': 'Sin datos',
                'primer_nombre': 'Sin datos'
            }
    except Exception:
        basic_data = {
            'apellido_paterno': 'Sin datos',
            'apellido_materno': 'Sin datos',
            'primer_nombre': 'Sin datos'
        }
    return [found, basic_data]
def getProjectDetails(credentials, projectNumber):
    """Function to get PCR data using projectNumber.

    Returns:
        dataframe with PCR data of a projectNumber
    """
    es = Elasticsearch(['http://' + credentials["username"] + ':' +
                        credentials["password"] + '@' + credentials["ip_and_port"]],
                       timeout=600)
    doc = {
        "query": {
            "term": {
                "projectNumber": {
                    "value": projectNumber,
                    "boost": 1.0
                }
            }
        }
    }
    data = es.search(index="pcrs", body=doc)
    projectDF = Select.from_dict(data).to_pandas()
    return projectDF
def setIsExtracted(fileName):
    # `credentials` is expected to be available in the enclosing module scope.
    es = Elasticsearch(['http://' + credentials["username"] + ':' +
                        credentials["password"] + '@' + credentials["ip_and_port"]],
                       timeout=600)
    doc = {"query": {"match_phrase": {"fileName": fileName}}}
    data = es.search(index="pcrs", body=doc)
    df = Select.from_dict(data).to_pandas()
    action = [{
        "_index": "pcrs",
        "_id": row["_id"],
        "_source": {
            "projectNumber": row["projectNumber"],
            "isExtracted": False,
            "tentative": row["tentative"],
            "title": row["title"],
            "sectors": row["sectors"],
            "countries": row["countries"],
            "themes": row["themes"],
            "downloadLink": row["downloadLink"],
            "fileName": row["fileName"],
            "monthYear": row["monthYear"],
            "month": row["month"],
            "year": row["year"],
            "milestoneApprovalDate": row["milestoneApprovalDate"],
            "milestoneEffectivityDate": row["milestoneEffectivityDate"],
            "milestoneSigningDate": row["milestoneSigningDate"],
            "safeguardCategories": row["safeguardCategories"],
            "sourceOfFunding": row["sourceOfFunding"],
            "modalitiesFromWebsite": row["modalitiesFromWebsite"],
            "modalitiesFromDump": row["modalitiesFromDump"],
            "uniqueModalitiesFromDump": row["uniqueModalitiesFromDump"]
        }
    } for index, row in df.iterrows()]
    es = Elasticsearch(['http://' + credentials["username"] + ':' +
                        credentials["password"] + '@' + credentials["ip_and_port"]],
                       timeout=600)
    helpers.bulk(es, action)
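# Since only the isExtracted flag actually changes, a lighter-weight variant is a bulk
# partial update. This is a hedged alternative sketch, not the module's original
# approach; the function name is illustrative.
def setIsExtractedPartial(fileName):
    es = Elasticsearch(['http://' + credentials["username"] + ':' +
                        credentials["password"] + '@' + credentials["ip_and_port"]],
                       timeout=600)
    data = es.search(index="pcrs",
                     body={"query": {"match_phrase": {"fileName": fileName}}})
    df = Select.from_dict(data).to_pandas()
    actions = [{
        "_op_type": "update",       # partial update instead of full re-index
        "_index": "pcrs",
        "_id": row["_id"],
        "doc": {"isExtracted": False}
    } for _, row in df.iterrows()]
    helpers.bulk(es, actions)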
def generate_es_resultset_file(self, request_id, output_file):
    initial_result_set = settings.ES.search(
        index=settings.ES_COMPUTED_RESULT_INDEX,
        body=self.HUMAN_VERIFICATION_MATCH_REQUEST_BODY
    )  # to get the size parameter
    resultset_size = initial_result_set['hits']['total']
    self.HUMAN_VERIFICATION_MATCH_REQUEST_BODY['size'] = resultset_size
    result_set = settings.ES.search(
        index=settings.ES_COMPUTED_RESULT_INDEX,
        body=self.HUMAN_VERIFICATION_MATCH_REQUEST_BODY)
    downloadable_df = Select.from_dict(result_set).to_pandas()
    try:
        comp_logger.info(
            "Generating dataframe from ES resultset for id: {}".format(request_id))
        downloadable_df = downloadable_df[settings.DOWNLOAD_HEADERS]
    except Exception:
        # if there is no human verdict match
        comp_logger.info(
            "ES resultset for human_verdict match for id: {} is empty".format(request_id))
        downloadable_df = pd.DataFrame(columns=settings.DOWNLOAD_HEADERS)
    if not os.path.exists(output_file):
        comp_logger.info(
            "ES human_verdict match resultset initiated for id: {}".format(request_id))
        downloadable_df.to_csv(output_file,
                               sep='\t',
                               index=False,
                               encoding='iso-8859-1')
    else:
        comp_logger.info(
            "ES human_verdict match resultset already generated for id: {}".format(request_id))
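# Note for newer clusters: from Elasticsearch 7 onward hits.total is an object
# ({"value": N, "relation": "eq"}), while the code above assumes a bare integer.
# A hedged helper that copes with both shapes (the helper name is illustrative):
def _total_hits(result_set):
    total = result_set['hits']['total']
    return total['value'] if isinstance(total, dict) else total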
def lu():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except Exception:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    lu_docs = eco.query_lu_rut(rut)
    try:
        # lu = lu_docs['hits']['hits'][0]['_source']
        # df = json2html.convert(json=lu)
        df = Select.from_dict(lu_docs).to_pandas()
        df.drop(columns=['_index', '_type', '_id', '_score'],
                inplace=True, errors='ignore')
        df_lu = df[[
            'fecha_generacion', 'lista_cliente', 'motivo_no_pago',
            'fase_inicio', 'fase_dia', 'saldo_cliente', 'dias_mora_real',
            'saldo_castigado', 'sucursal_cobranza', 'banca_cliente'
        ]]
        df_lu.sort_values(by=['fecha_generacion'], inplace=True)
        graficar = 0
        if graficar == 1:
            chartt.chart_lu_x_date(df_lu)
        return render_template('lu.html',
                               rut=rut,
                               dv=df['dv'][0],
                               ap_pat=df['apellido_paterno'][0],
                               ap_mat=df['apellido_materno'][0],
                               primer_nombre=df['primer_nombre'][0],
                               lu=df_lu,
                               graficar=graficar)
    except Exception as e:
        return 'sin datos. ' + str(e)
def predict(index):
    data = csv_reader(index_name=index, size=10000)
    # print(json.dumps(data, indent=4))
    df = Select.from_dict(data).to_pandas()
    df["Time"] = df["Time"].str.replace("T", " ").str[0:19]
    df_col_time = df["Time"].str.split(' ', expand=True)
    df["Date"] = df_col_time[0]
    df["Date_Time"] = df_col_time[1]
    # "Value" arrives as strings; convert to numeric so the order-of-magnitude bins
    # below compare numbers (the original compared the strings lexicographically).
    value = pd.to_numeric(df["Value"], errors="coerce")
    df["Value_comb"] = '0'
    df.loc[value.isnull(), "Value_comb"] = '0'
    df.loc[(value > 0) & (value < 0.000001), "Value_comb"] = '1'
    df.loc[(value >= 0.000001) & (value < 0.00001), "Value_comb"] = '2'
    df.loc[(value >= 0.00001) & (value < 0.0001), "Value_comb"] = '3'
    df.loc[(value >= 0.0001) & (value < 0.001), "Value_comb"] = '4'
    df.loc[(value >= 0.001) & (value < 0.01), "Value_comb"] = '5'
    df.loc[(value >= 0.01) & (value < 0.1), "Value_comb"] = '6'
    df.loc[(value >= 0.1) & (value < 1), "Value_comb"] = '7'
    df.loc[(value >= 1) & (value < 10), "Value_comb"] = '8'
    df.loc[(value >= 10) & (value < 100), "Value_comb"] = '9'
    df.loc[(value >= 100) & (value < 1000), "Value_comb"] = '10'
    df.loc[(value >= 1000) & (value < 10000), "Value_comb"] = '11'
    df.loc[(value >= 10000) & (value < 100000), "Value_comb"] = '12'
    df.loc[(value >= 100000) & (value < 1000000), "Value_comb"] = '13'
    df.loc[(value >= 1000000) & (value < 10000000), "Value_comb"] = '14'
    df.loc[(value >= 10000000) & (value < 100000000), "Value_comb"] = '15'
    df.loc[(value >= 100000000) & (value < 1000000000), "Value_comb"] = '16'
    df.loc[(value >= 1000000000) & value.notnull(), "Value_comb"] = '17'
    df["Value_comb"] = df["Value_comb"].astype(float)
    # data["Time"] = pd.to_datetime(data["Time"])
    df["Num_Date_Time"] = df["Date_Time"].str.replace(":", ".").str[0:5]
    # df["Num_Date_Time"] = df["Num_Date_Time"].str[3:8]
    df["Num_Date_Time"] = df["Num_Date_Time"].astype(float)
    new_data = df[["Num_Date_Time", "Value_comb"]].copy()
    indexnewData = new_data.set_index(["Num_Date_Time"])
    rollmean = indexnewData.rolling(window=13).mean()
    rollstd = indexnewData.rolling(window=13).std()
    new_data.to_csv('export_dataframe.csv')
    series = read_csv('export_dataframe.csv', header=0, index_col=0)
    series.set_index(['Num_Date_Time'], inplace=True)

    # create a difference transform of the dataset
    def difference(dataset):
        diff = list()
        for i in range(1, len(dataset)):
            value = dataset[i] - dataset[i - 1]
            diff.append(value)
        return numpy.array(diff)

    # make a prediction given regression coefficients and lag observations
    def predict(coef, history):
        yhat = coef[0]
        for i in range(1, len(coef)):
            yhat += coef[i] * history[-i]
        return yhat

    # split dataset
    X = difference(series.values)
    size = int(len(X) * 0.66)
    train, test = X[0:size], X[size:]
    # train autoregression
    window_size = 6
    model = AR(train)
    model_fit = model.fit(maxlag=window_size, disp=False)
    # save model to file
    # model_fit.save('ar_model.pkl')
    # save model using pickle
    pickle.dump(model_fit, open('pickle_model.pkl', 'wb'))
    # save the differenced dataset
    numpy.save('ar_data.npy', X)
    # save the last observation
    numpy.save('ar_obs.npy', [series.values[-1]])
    # save coefficients
    coef = model_fit.params
    numpy.save('man_model.npy', coef)
    # save lag
    lag = X[-window_size:]
    numpy.save('man_data.npy', lag)
    window = model_fit.k_ar
    coef = model_fit.params
    # walk forward over time steps in test
    history = [train[i] for i in range(len(train))]
    predictions = list()
    for t in range(len(test)):
        yhat = predict(coef, history)
        obs = test[t]
        predictions.append(yhat)
        history.append(obs)
    error = mean_squared_error(test, predictions)
    print('Test MSE: %.3f' % error)
    # load the AR model from file
    model = ARResults.load('pickle_model.pkl')
    # print(loaded.params)
    data = numpy.load('ar_data.npy')
    last_ob = numpy.load('ar_obs.npy')
    print(last_ob)
    coef = numpy.load('man_model.npy')
    print(coef)
    lag = numpy.load('man_data.npy')
    print(lag)
    # make prediction
    # prediction = predict(coef, lag)
    prediction = predict(coef, lag)
    # transform prediction
    yhat = prediction + last_ob[0]
    print('Prediction: %f' % yhat)
    x_predict = []
    y_predict = []
    x_test = []
    y_test = []
    for data in predictions:
        point = data.shape
        if len(point) == 1:
            x_predict.append(point[0])
            y_predict.append(None)
        else:
            x_predict.append(point[0])
            y_predict.append(point[1])
    for data in test:
        point = data.shape
        if len(point) == 1:
            x_test.append(point[0])
            y_test.append(None)
        else:
            x_test.append(point[0])
            y_test.append(point[1])
    # result = ((x_predict, y_predict), (x_test, y_test))
    result = (predictions, test)
    return result
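# Hedged usage sketch for the forecasting routine above. The index name is a
# placeholder; the artefacts written by predict() (man_model.npy, man_data.npy,
# ar_obs.npy) can be reloaded later to produce a one-step forecast without refitting,
# following the same manual AR formula used inside the function.
predictions, test = predict("metrics-demo")  # "metrics-demo" is a placeholder index
coef = numpy.load('man_model.npy')
lag = numpy.load('man_data.npy')
last_ob = numpy.load('ar_obs.npy')
yhat_diff = coef[0] + sum(coef[i] * lag[-i] for i in range(1, len(coef)))
print('Next value estimate:', yhat_diff + last_ob[0])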
from collections import Counter
from operator import itemgetter

import seaborn as sns
from elasticsearch import Elasticsearch
from pandasticsearch import Select

"""Import data from Elasticsearch"""
es = Elasticsearch(["localhost:9200"])
order = es.search(index='commandes', body={}, size=1000)
customer = es.search(index='custommer', body={}, size=1000)
order_df = Select.from_dict(order).to_pandas()
customer_df = Select.from_dict(customer).to_pandas()

"""Prepare dataset with groupby"""
# order_df.groupby(order_df.version.apply(lambda x: x['major'])).size()

"""Essay 1"""
def productReturn():
    for i in order_df.orderItems:
        for j in i:
            return j['product']

productReturn()
# cd = order_df.groupby('orderItems').apply(lambda x: x)
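# productReturn() above exits on the first nested item. A hedged sketch of a variant
# that gathers every product across all orderItems; only the 'orderItems' and
# 'product' field names come from the snippet above, the rest is illustrative.
def all_products():
    products = []
    for items in order_df.orderItems:
        for item in items:
            products.append(item['product'])
    return products

print(len(all_products()))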
def update_task(words, progress, updaterate):
    startid = progress['task']
    if startid > 0:
        task_p = pd.read_json('data/pms_task.json')
    else:
        task_p = pd.DataFrame()
    while True:
        # sleep(1)
        result_dict = es.search(index="pms_task",
                                body={
                                    "query": {
                                        "constant_score": {
                                            "filter": {
                                                "range": {
                                                    "id": {
                                                        "gt": startid,
                                                        "lt": startid + updaterate + 1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                size=10000)
        if len(result_dict['hits']['hits']) == 0:
            break
        task_new = Select.from_dict(result_dict).to_pandas()[[
            'name', 'id', 'desc'
        ]]
        # from nohtml import strip_tags
        # task_new['desc'] = task_new['desc'].apply(lambda x: strip_tags(x))
        task_new = task_new.loc[task_new['id'].drop_duplicates().index, :]
        task_new = task_new.set_index(task_new['id'].values)
        # task_new['id'] = task_new['id'].astype(str)
        task_new['seg'] = ''
        for i in task_new['id'].values:
            print(i)
            seg = segmentation(
                (task_new['name'][i] + task_new['desc'][i]).split())
            word_count = Counter(seg)
            print(word_count)
            wordn = sum(word_count.values())
            for word in word_count:
                word_count[word] = word_count[word] / wordn
            print(word_count)
            words += Counter(word_count.keys())
            task_new['seg'][i] = dict(word_count)
        task_p = pd.concat([task_p, task_new], axis=0)
        startid = np.sort(task_new['id'].values)[-1]
        progress['task'] = int(startid)
    task_p.to_json('data/seg/pms_task.json')
# (fragment: this snippet begins inside the request body of an es.search(...) call;
#  the opening clauses of the bool query are not shown)
                    ]
                }
            },
            {
                "range": {
                    "m_LogDate": {
                        "gte": "2020-12-21T00:00:00.000Z",
                        "lte": "2020-12-28"
                    }
                }
            },
        ]
    }
}})

df = Select.from_dict(data).to_pandas()
df.drop_duplicates(subset='m_To', keep='last', inplace=True)
df.to_csv(directory + "blocksall1306maymta2.csv", index=False)
df['Blocker'] = df.apply(lambda row: blocker(row), axis=1)
to_dropcols = ['m_LogEntry', 'm_LogDate', '_id', '_index', '_score', '_type',
               'm_From', 'm_LogType', 'm_MessageId', 'm_SubmissionDate']
df.drop(to_dropcols, axis=1, inplace=True)
print(df.shape)
print(list(df.columns.values))
print(df)
# print(data['hits'][1])
df.to_csv(directory + "blocks28decmta1.csv", index=False)
def update_story(words, progress, updaterate):
    startid = progress['story']
    if startid > 0:
        story_p = pd.read_json('data/pms_story.json')
    else:
        story_p = pd.DataFrame()
    while True:
        # sleep(1)
        result_dict = es.search(index="pms_story",
                                body={
                                    "query": {
                                        "constant_score": {
                                            "filter": {
                                                "range": {
                                                    "story": {
                                                        "gt": startid,
                                                        "lt": startid + updaterate + 1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                size=10000)
        if len(result_dict['hits']['hits']) == 0:
            break
        story_new = Select.from_dict(result_dict).to_pandas()[[
            'story', 'title', 'spec'
        ]]
        # sql = 'select story, title, spec from zt_storyspec where story > {0} limit {1}'.format(startid, updaterate)
        # story_new = pd.read_sql(sql, engine)
        # ifremain = (len(story_new) == updaterate)
        # print(len(story_new))
        # print(ifremain)
        # from nohtml import strip_tags
        # story_new['spec'] = story_new['spec'].apply(lambda x: strip_tags(x))
        story_new = story_new.loc[story_new['story'].drop_duplicates().index, :]
        story_new = story_new.set_index(story_new['story'].values)
        story_new['seg'] = ''
        for i in story_new['story'].values:
            seg = segmentation(
                (story_new['title'][i] + story_new['spec'][i]).split())
            word_count = Counter(seg)
            print(i)
            print(word_count)
            wordn = sum(word_count.values())
            for word in word_count:
                word_count[word] = word_count[word] / wordn
            # print(word_count)
            words += Counter(word_count.keys())
            story_new['seg'][i] = dict(word_count)
        story_p = pd.concat([story_p, story_new], axis=0)
        startid = np.sort(story_new['story'].values)[-1]
        print(startid)
        progress['story'] = int(startid)
    story_p.to_json('data/seg/pms_story.json')
def update_bug(words, progress, updaterate):
    startid = progress['bug']
    if startid > 0:
        bug_p = pd.read_json('data/pms_bug.json')
    else:
        bug_p = pd.DataFrame()
    while True:
        # sleep(1)
        result_dict = es.search(index="pms_bug",
                                body={
                                    "query": {
                                        "constant_score": {
                                            "filter": {
                                                "range": {
                                                    "id": {
                                                        "gt": startid,
                                                        "lt": startid + updaterate + 1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                size=10000)
        if len(result_dict['hits']['hits']) == 0:
            break
        bug_new = Select.from_dict(result_dict).to_pandas()[[
            'id', 'title', 'steps'
        ]]
        bug_new = bug_new.loc[bug_new['id'].drop_duplicates().index, :]
        bug_new = bug_new.set_index(bug_new['id'].values)
        bug_new['seg'] = ''
        for i in bug_new['id'].values:
            # print(i)
            seg = segmentation(
                (bug_new['title'][i] + bug_new['steps'][i]).split())
            word_count = Counter(seg)
            # print(word_count)
            wordn = sum(word_count.values())
            for word in word_count:
                word_count[word] = word_count[word] / wordn
            # print(word_count)
            words += Counter(word_count.keys())
            bug_new['seg'][i] = dict(word_count)
        bug_p = pd.concat([bug_p, bug_new], axis=0)
        startid = np.sort(bug_new['id'].values)[-1]
        print(startid)
        progress['bug'] = int(startid)
    bug_p.to_json('data/seg/pms_bug.json')
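# The three update_* functions above share one pattern: page through an index in id
# ranges of `updaterate` documents and convert each page with pandasticsearch. A
# hedged, generic sketch of that pattern; the function and argument names here are
# illustrative and not part of the original module.
def fetch_by_id_range(index_name, id_field, startid, updaterate):
    frames = []
    while True:
        result_dict = es.search(index=index_name,
                                body={
                                    "query": {
                                        "constant_score": {
                                            "filter": {
                                                "range": {
                                                    id_field: {
                                                        "gt": startid,
                                                        "lt": startid + updaterate + 1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                size=10000)
        hits = result_dict['hits']['hits']
        if not hits:
            break
        page = Select.from_dict(result_dict).to_pandas()
        frames.append(page)
        startid = int(np.sort(page[id_field].values)[-1])
    return pd.concat(frames) if frames else pd.DataFrame()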
from elasticsearch import Elasticsearch
from pandasticsearch import Select
import pandas as pd
from k_anonymity import *
import util
import os

script_dir = os.path.dirname(__file__)

'''get Elasticsearch data as a Python dict'''
es = Elasticsearch('http://localhost:9200')
result_dict = es.search(index="kibana_sample_data_logs",
                        body={"query": {"match_all": {}}},
                        size=10000)
df = Select.from_dict(result_dict).to_pandas()

'''data cleaning'''
df = util.explode(df, ['tags'])
for column in df.select_dtypes('object').columns:
    df[column] = df[column].astype('category')
df['response'] = df['response'].astype('category')
print("Data type of columns:")
print(df.dtypes)

'''analyse data span'''
full_spans = get_spans(df, df.index)
print("Full span:")
print(full_spans)

'''choose columns to be anonymized'''
feature_columns = ['geo.coordinates.lat', 'geo.coordinates.lon']
sensitive_column = 'response'
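# The match_all search above is capped at 10,000 hits per request. If the index is
# larger, a hedged alternative is to stream every document with the scan helper and
# flatten the _source dicts; this is a sketch, not part of the original script, and
# the function name is illustrative.
from elasticsearch import helpers

def fetch_all(index_name):
    docs = helpers.scan(es, index=index_name, query={"query": {"match_all": {}}})
    # json_normalize flattens nested fields into dotted column names such as
    # 'geo.coordinates.lat', matching the feature_columns used above.
    return pd.json_normalize([hit["_source"] for hit in docs])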
def get_unique_sensor_ids_around_geo_location(geo_shape, filter_by_sensor_types=None):
    if filter_by_sensor_types is None:
        filter_by_sensor_types = []
    size = 1000
    search_query = {
        "size": size,
        "query": {"bool": {}},
        "aggs": {
            "unique_sensor_ids": {
                "terms": {
                    "field": "sensor_id"
                }
            }
        }
    }
    geo_data = {
        "geo_polygon": {
            "ignore_unmapped": True,
            "geo_location": {
                "points": geo_shape
            }
        }
    }
    search_query["query"]["bool"]["filter"] = geo_data
    if filter_by_sensor_types:
        search_query["query"]["bool"]["must"] = {
            "terms": {
                "sensor_type": filter_by_sensor_types
            }
        }
    # query the results and pass a param: scroll=1m
    response = es.search(index=es_index_name,
                         doc_type=es_doc_type,
                         body=search_query,
                         params={'scroll': '1m'})
    # get the scroll id
    scroll_id = response.get('_scroll_id')
    total_results = response['hits']['total']
    scroll_size = total_results

    from pandasticsearch import Select
    df = Select.from_dict(response).to_pandas()
    results_fetched = size
    while scroll_size > 0:
        page = es.scroll(scroll_id=scroll_id, scroll='2m')
        # Update the scroll ID
        scroll_id = page.get('_scroll_id')
        # Get the number of results that we returned in the last scroll
        scroll_size = len(page['hits']['hits'])
        message = "Fetching {}/{} results ({}%)".format(
            results_fetched, total_results,
            round((results_fetched / total_results) * 100, 2))
        print(message)
        # Do something with the obtained page
        df_page_next = Select.from_dict(page).to_pandas()
        df = pd.concat([df, df_page_next], ignore_index=True)
        results_fetched += scroll_size
    print("")
    # get the unique sensor_id
    df_sensor_ids = df['sensor_id'].unique()
    # sort the ids
    df_sensor_ids.sort()
    unique_sensor_ids = list(df_sensor_ids)
    return unique_sensor_ids
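# Hedged usage sketch for the lookup above. The polygon points follow the lat/lon
# dict form Elasticsearch accepts for geo_polygon queries; the coordinates and the
# sensor type below are placeholders, not values from the original code.
polygon = [
    {"lat": 52.52, "lon": 13.39},
    {"lat": 52.53, "lon": 13.42},
    {"lat": 52.50, "lon": 13.43},
]
ids = get_unique_sensor_ids_around_geo_location(polygon,
                                                filter_by_sensor_types=["temperature"])
print(len(ids), "unique sensors inside the polygon")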