def getLessons(credentials):
    """Description: Function to query all lessons data from elasticsearch
    Returns: dataframe with all the lessons data
    Usage:
    >>> import ElasticFunctions as ef
    >>> df = ef.getLessons(credentials)
    """
    es = Elasticsearch(['http://' + credentials["username"] + ':' + credentials["password"] + '@' + credentials["ip_and_port"]], timeout=600)
    doc = {
        "query": {
            "term": {
                "isLesson": {
                    "value": True,
                    "boost": 1.0
                }
            }
        }
    }
    lessonsDF = pd.DataFrame()
    data = es.search(index="sentences", body=doc, scroll='1m')
    scrollId = data['_scroll_id']
    scrollSize = len(data['hits']['hits'])
    while scrollSize > 0:
        if lessonsDF.empty:
            lessonsDF = Select.from_dict(data).to_pandas()
        else:
            lessonsDF = lessonsDF.append(Select.from_dict(data).to_pandas())
        data = es.scroll(scroll_id = scrollId, scroll = '1m')
        scrollId = data['_scroll_id']
        scrollSize = len(data['hits']['hits'])
    return lessonsDF
def getBaseClassification(credentials):
    """Function to query all base classification data from elasticsearch
    Returns: dataframe with all the base classification data
    Usage:
    >>> import ElasticFunctions as ef
    >>> df = ef.getLessons(credentials)
    """
    es = Elasticsearch(['http://' + credentials["username"] + ':' + credentials["password"] + '@' + credentials["ip_and_port"]], timeout=600)
    doc = {
            'size' : 10000,
            'query': {
                'match_all' : {}
        }
    }
    baseClassificationDF = pd.DataFrame()
    data = es.search(index="base-classification", body=doc, scroll='1m')
    scrollId = data['_scroll_id']
    scrollSize = len(data['hits']['hits'])
    while scrollSize > 0:
        if baseClassificationDF.empty:
            baseClassificationDF = Select.from_dict(data).to_pandas()
        else:
            baseClassificationDF = baseClassificationDF.append(Select.from_dict(data).to_pandas())
        data = es.scroll(scroll_id = scrollId, scroll = '1m')
        scrollId = data['_scroll_id']
        scrollSize = len(data['hits']['hits'])
    return baseClassificationDF
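The credentials argument is never described in these examples; both functions simply read three keys from a plain dict. A minimal usage sketch with placeholder values (not taken from the original source):

credentials = {
    "username": "elastic",            # placeholder
    "password": "changeme",           # placeholder
    "ip_and_port": "localhost:9200"   # placeholder "host:port"
}
lessonsDF = getLessons(credentials)
baseClassificationDF = getBaseClassification(credentials)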
Example #3
def get_df_ch(rut):
    """Fetch call-history (ch) documents for every i3_identity linked to a RUT."""
    eco = ecomart.Ecomart()
    li_docs = eco.query_li_rut(rut)
    lista_i3_identity = []
    try:
        df_li = Select.from_dict(li_docs).to_pandas()
        df_li = df_li[['i3_identity', 'canal_id', 'table_name', 'dbname']]
        lista_i3_identity = df_li['i3_identity'].unique()
    except Exception:
        pass  # no li documents (or missing fields) for this RUT

    df_all = pd.DataFrame()  # avoid returning an unbound name when nothing is found
    if len(lista_i3_identity):
        df_ch = []
        for i3 in lista_i3_identity:
            ch_docs = eco.query_ch_i3(i3)
            if len(ch_docs) > 0:
                try:
                    ch = Select.from_dict(ch_docs).to_pandas()
                    ch2 = ch[[
                        'i3_identity', 'canal_id', 'callconnectedtimeUTC',
                        'wcode_displayname', 'wcat_displayname'
                    ]]
                    df_ch.append(ch2)
                except Exception:
                    pass  # skip hits missing the expected fields

        if df_ch:
            df_all = pd.concat(df_ch)
    return df_all
Example #4
def inin():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    li_docs = eco.query_li_rut(rut)

    try:
        lista_i3_identity = []
        #li = li_docs['hits']['hits'][0]['_source']
        #df = json2html.convert(json = li)
        df_li = Select.from_dict(li_docs).to_pandas()
        #df_li.drop(columns=['_index', '_type', '_id', '_score'], inplace=True, errors='ingnore')
        df_li = df_li[['canal_id', 'table_name', 'dbname', 'i3_identity']]
        lista_i3_identity = df_li['i3_identity'].unique()
    except:
        pass

    if len(lista_i3_identity):
        df_ch = []
        for i3 in lista_i3_identity:
            ch_docs = eco.query_ch_i3(i3)
            if len(ch_docs) > 0:
                try:
                    ch = Select.from_dict(ch_docs).to_pandas()
                    ch2 = ch[[
                        'campaignname', 'canal_id', 'callconnectedtimeUTC',
                        'wcode_displayname', 'wcat_displayname', 'length',
                        'i3_identity'
                    ]]
                    df_ch.append(ch2)
                except:
                    pass

        df_all = pd.concat(df_ch) if df_ch else pd.DataFrame()
    else:
        return 'sin datos'

    found, datos = get_basic_data(rut)
    if found:
        df_all.sort_values(by=['callconnectedtimeUTC'], inplace=True)
        # Generate a timeline-style chart
        #chartt.gen_timeline(df_all)
        #tl_chart = chartt.gen_timeline2(df_all)
        graficar = 0
        if graficar == 1:
            chartt.chart_wcode_x_date(df_all)
        return render_template('inin.html',
                               rut=rut,
                               ap_pat=datos['apellido_paterno'],
                               ap_mat=datos['apellido_materno'],
                               primer_nombre=datos['primer_nombre'],
                               li=df_li,
                               ch=df_all,
                               graficar=graficar)
Example #5
def process_hits(hits):
    frames = []
    for item in hits:
        # Convert each raw Elasticsearch response dict into a DataFrame
        a = Select.from_dict(item).to_pandas()
        print(a.shape)
        print(list(a.columns.values))
        frames.append(a)
    # Combine every processed page into a single DataFrame
    return pd.concat(frames) if frames else pd.DataFrame()
Example #6
    def index_data_to_df(
        self,
        index_name: str,
        start_time: str,
        end_time: str,
        field_list: list,  # column names to keep from each page
    ):
        query = {"range": {"@timestamp": {"gte": start_time, "lte": end_time}}}
        sort = {"@timestamp": {"order": "asc"}, "_id": {"order": "asc"}}
        max_size = 5000
        search_after = [0, 0]
        pandas_df = pd.DataFrame()

        while True:
            result_dict = self._es_query(index_name, query, sort, search_after,
                                         max_size)
            if len(result_dict['hits']['hits']) == 0:
                break

            tmp = Select.from_dict(result_dict).to_pandas()
            pandas_df = pandas_df.append(tmp[field_list])

            search_after = [
                self._unix_time_millis(pandas_df['@timestamp'].iloc[-1]),
                pandas_df['_id'].iloc[-1]
            ]

        pandas_df.to_csv('input_ryu.csv')
        return pandas_df
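index_data_to_df relies on a private _es_query helper that is not part of this example. A hedged sketch of what such a helper could look like, assuming a self.es Elasticsearch client (the name and body layout are assumptions, not the original implementation):

    def _es_query(self, index_name, query, sort, search_after, max_size):
        # Assumed helper: fetch one page of a search_after-paginated query
        body = {
            "query": query,
            "sort": sort,
            "search_after": search_after,
            "size": max_size,
        }
        return self.es.search(index=index_name, body=body)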
Example #7
    def getResultScrolling(self, partRes, partialSize=None):
        df = None
        scrollId = partRes['_scroll_id']

        # Do you want the whole index or a partial data ?
        scrollSize = partRes['hits']['total']['value']
        if partialSize:
            if partialSize <= scrollSize:
                scrollSize = partialSize

        while scrollSize > 0:
            #print(scrollSize)

            pandas_df = Select.from_dict(partRes).to_pandas()
            chunkSize = len(partRes['hits']['hits'])
            if df is None:
                df = pandas_df
            else:
                df = df.append(pandas_df, ignore_index=True)
            # arrayHits = partRes['hits']['hits']
            # for hit in arrayHits:
            #    for key, val in hit["_source"].items():
            #        try:
            #            fields[key] = np.append(fields[key], val)
            #        except KeyError:
            #            fields[key] = np.array([val])

            scrollSize = scrollSize - chunkSize
            partRes = self.scroll(scroll_id=scrollId)

        df.drop(columns=["_index", "_type", "_id", "_score"],
                axis=1,
                inplace=True)
        return df
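getResultScrolling expects the first page of a scroll search as partRes plus a scroll() wrapper on the same object. A hypothetical driver (the owning class is not shown in this example; "client" stands in for an instance of it):

def fetch_all(client, es, index_name, partial_size=None):
    # Open a scroll search and hand its first page to getResultScrolling
    first_page = es.search(index=index_name,
                           body={"query": {"match_all": {}}},
                           scroll="2m", size=1000)
    return client.getResultScrolling(first_page, partialSize=partial_size)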
Example #8
def comportamiento():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    lu_docs = eco.query_lu_rut(rut)
    try:
        df = Select.from_dict(lu_docs).to_pandas()
        df.drop(columns=['_index', '_type', '_id', '_score'],
                inplace=True,
                errors='ignore')
        df_lu = df[[
            'fecha_generacion', 'lista_cliente', 'motivo_no_pago',
            'fase_inicio', 'fase_dia', 'saldo_cliente', 'dias_mora_real',
            'saldo_castigado', 'sucursal_cobranza', 'banca_cliente'
        ]]
        df_lu.sort_values(by=['fecha_generacion'], inplace=True)

        df_ch = get_df_ch(rut)
        chartt.chart_comp(df_lu, df_ch)
        chartt.chart_wcode_x_date(df_ch)
        return render_template('comportamiento.html',
                               rut=rut,
                               dv=df['dv'][0],
                               ap_pat=df['apellido_paterno'][0],
                               ap_mat=df['apellido_materno'][0],
                               primer_nombre=df['primer_nombre'][0],
                               lu=df_lu)
    except Exception as e:
        return 'sin datos.' + str(e)
Example #9
def deudor_head():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    lu_docs = eco.query_lu_rut(rut)
    try:
        #lu = lu_docs['hits']['hits'][0]['_source']
        #df = json2html.convert(json = lu)
        df = Select.from_dict(lu_docs).to_pandas()
        df.drop(columns=['_index', '_type', '_id', '_score'],
                inplace=True,
                errors='ignore')
        df_lu = df[[
            'fecha_generacion', 'lista_cliente', 'motivo_no_pago',
            'fase_inicio', 'fase_dia', 'saldo_cliente', 'dias_mora_real'
        ]]
        df_lu = df_lu.sort_values(by=['fecha_generacion'])  # assign: not sorted in place here
        otros_datos = get_otros_datos(df)
        return render_template('deudor.html',
                               rut=rut,
                               dv=df['dv'][0],
                               ap_pat=df['apellido_paterno'][0],
                               ap_mat=df['apellido_materno'][0],
                               primer_nombre=df['primer_nombre'][0],
                               otros_datos=otros_datos)
    except Exception as e:
        return 'sin data.' + str(e)
Example #10
def getLessons(credentials):
    es = Elasticsearch([
        'http://' + credentials["username"] + ':' + credentials["password"] +
        '@' + credentials["ip_and_port"]
    ],
                       timeout=600)
    doc = {"query": {"term": {"isLesson": {"value": True, "boost": 1.0}}}}
    lessonsDF = pd.DataFrame()
    data = es.search(index="sentences", body=doc, scroll='1m')
    scrollId = data['_scroll_id']
    scrollSize = len(data['hits']['hits'])
    while scrollSize > 0:
        if lessonsDF.empty:
            lessonsDF = Select.from_dict(data).to_pandas()
        else:
            lessonsDF = lessonsDF.append(Select.from_dict(data).to_pandas())
        data = es.scroll(scroll_id=scrollId, scroll='1m')
        scrollId = data['_scroll_id']
        scrollSize = len(data['hits']['hits'])
    return lessonsDF
Example #11
def getTopics(credentials):
    es = Elasticsearch([
        'http://' + credentials["username"] + ':' + credentials["password"] +
        '@' + credentials["ip_and_port"]
    ],
                       timeout=600)
    doc = {'size': 10000, 'query': {'match_all': {}}}
    topicsDF = pd.DataFrame()
    data = es.search(index="topics", body=doc, scroll='1m')
    scrollId = data['_scroll_id']
    scrollSize = len(data['hits']['hits'])
    while scrollSize > 0:
        if topicsDF.empty:
            topicsDF = Select.from_dict(data).to_pandas()
        else:
            topicsDF = topicsDF.append(Select.from_dict(data).to_pandas())
        data = es.scroll(scroll_id=scrollId, scroll='1m')
        scrollId = data['_scroll_id']
        scrollSize = len(data['hits']['hits'])
    return topicsDF
Example #12
    def get_tweets(self, index_search):
        index_aux = "tweets-sentiment_" + index_search
        # df1 uses the legacy pandasticsearch DataFrame.from_es API and is not
        # used below; the Select-based path is what actually builds the result
        df1 = DataFrame.from_es(url="http://192.168.127.129:9200",
                                index=index_aux,
                                doc_type="new-tweet",
                                compat=6)
        es = Elasticsearch('http://192.168.127.129:9200')
        results = es.search(index=index_aux,
                            size=5600,
                            body={"query": {"match_all": {}}})
        df = Select.from_dict(results).to_pandas()
        return df
Example #13
def ajax2():

    print('ajax2')
    rut = request.form.get('rut')
    print('rut:', rut, flush=True)
    print('Buscando para RUT:', rut, flush=True)

    mensaje = ''
    resultado = 0

    try:
        eco = ecomart.Ecomart()
        lu_docs = eco.query_lu_rut(rut)
        df = Select.from_dict(lu_docs).to_pandas()
        if (len(df.index) > 0):
            resultado = 1
            deu.set_rut(rut)
    except:
        mensaje = 'No encontre el rut consultado.'

    try:
        ap_pat = ''
        ap_pat = df['apellido_paterno'][0]
    except:
        ap_pat = 'Apellido Paterno, sin datos'
    try:
        ap_mat = ''
        ap_mat = df['apellido_materno'][0]
    except:
        ap_mat = 'Apellido Materno, sin datos'
    try:
        primer_nombre = ''
        primer_nombre = df['primer_nombre'][0]
    except:
        primer_nombre = 'Nombre, Sin datos'

    respuesta = {
        'resultado': resultado,
        'mensaje': mensaje,
        'rut': rut,
        'nombre': primer_nombre,
        'apellido_paterno': ap_pat,
        'apellido_materno': ap_mat,
        'edad': 'ND'
    }
    return jsonify(respuesta)
Example #14
def get_basic_data(rut):
    found = False
    try:
        print('buscando en ecomart con RUT : ' + str(rut))
        eco = ecomart.Ecomart()
        lu_docs = eco.query_lu_rut(rut)
        df = Select.from_dict(lu_docs).to_pandas()
        print('largo df:', len(df.index), flush=True)
        if (len(df.index) > 0):
            deu.set_rut(rut)
            found = True
            print('encontro.')
            try:
                ap_pat = ''
                ap_pat = df['apellido_paterno'][0]
            except:
                ap_pat = 'Sin datos'
            try:
                ap_mat = ''
                ap_mat = df['apellido_materno'][0]
            except:
                ap_mat = 'Sin datos'
            try:
                primer_nombre = ''
                primer_nombre = df['primer_nombre'][0]
            except:
                primer_nombre = 'Sin datos'

            basic_data = {
                'apellido_paterno': ap_pat,
                'apellido_materno': ap_mat,
                'primer_nombre': primer_nombre,
            }
        else:
            basic_data = {
                'apellido_paterno': 'Sin datos',
                'apellido_materno': 'Sin datos',
                'primer_nombre': 'Sin datos'
            }
    except:
        basic_data = {
            'apellido_paterno': 'Sin datos',
            'apellido_materno': 'Sin datos',
            'primer_nombre': 'Sin datos'
        }
    return [found, basic_data]
Example #15
def getProjectDetails(credentials, projectNumber):
    """Function to get PCR data using projectNumber
    Returns: dataframe with PCR data of a projectNumber
    """
    es = Elasticsearch(['http://' + credentials["username"] + ':' + credentials["password"] + '@' + credentials["ip_and_port"]], timeout=600)
    doc = {
        "query": {
            "term": {
                "projectNumber": {
                    "value": projectNumber,
                    "boost": 1.0
                }
            }
        }
    }
    data = es.search(index="pcrs", body=doc)
    projectDF = Select.from_dict(data).to_pandas()
    return projectDF
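A hypothetical call (the project number shown is a placeholder, not taken from the source):

pcrDF = getProjectDetails(credentials, projectNumber="46123-001")
print(pcrDF.shape)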
Example #16
def setIsExtracted(fileName):
    # NOTE: relies on a module-level `credentials` dict; it is not a parameter here
    es = Elasticsearch([
        'http://' + credentials["username"] + ':' + credentials["password"] +
        '@' + credentials["ip_and_port"]
    ],
                       timeout=600)
    doc = {"query": {"match_phrase": {"fileName": fileName}}}
    data = es.search(index="pcrs", body=doc)
    df = Select.from_dict(data).to_pandas()
    action = [{
        "_index": "pcrs",
        "_id": row["_id"],
        "_source": {
            "projectNumber": row["projectNumber"],
            "isExtracted": False,
            "tentative": row["tentative"],
            "title": row["title"],
            "sectors": row["sectors"],
            "countries": row["countries"],
            "themes": row["themes"],
            "downloadLink": row["downloadLink"],
            "fileName": row["fileName"],
            "monthYear": row["monthYear"],
            "month": row["month"],
            "year": row["year"],
            "milestoneApprovalDate": row["milestoneApprovalDate"],
            "milestoneEffectivityDate": row["milestoneEffectivityDate"],
            "milestoneSigningDate": row["milestoneSigningDate"],
            "safeguardCategories": row["safeguardCategories"],
            "sourceOfFunding": row["sourceOfFunding"],
            "modalitiesFromWebsite": row["modalitiesFromWebsite"],
            "modalitiesFromDump": row["modalitiesFromDump"],
            "uniqueModalitiesFromDump": row["uniqueModalitiesFromDump"]
        }
    } for index, row in df.iterrows()]
    es = Elasticsearch([
        'http://' + credentials["username"] + ':' + credentials["password"] +
        '@' + credentials["ip_and_port"]
    ],
                       timeout=600)
    helpers.bulk(es, action)
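setIsExtracted re-sends every stored field just to flip isExtracted, and it also builds the client twice. A more compact alternative, offered as a sketch rather than the original author's approach, would be a partial bulk update:

action = [{
    "_op_type": "update",
    "_index": "pcrs",
    "_id": row["_id"],
    "doc": {"isExtracted": False},   # only the changed field is sent
} for _, row in df.iterrows()]
helpers.bulk(es, action)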
Example #17
    def generate_es_resultset_file(self, request_id, output_file):
        initial_result_set = settings.ES.search(
            index=settings.ES_COMPUTED_RESULT_INDEX,
            body=self.HUMAN_VERIFICATION_MATCH_REQUEST_BODY
        )  # to get size parameter
        # hits.total is an int on ES <= 6; on ES 7+ use ['hits']['total']['value']
        resultset_size = initial_result_set['hits']['total']
        self.HUMAN_VERIFICATION_MATCH_REQUEST_BODY['size'] = resultset_size
        result_set = settings.ES.search(
            index=settings.ES_COMPUTED_RESULT_INDEX,
            body=self.HUMAN_VERIFICATION_MATCH_REQUEST_BODY)
        downloadable_df = Select.from_dict(result_set).to_pandas()

        try:
            comp_logger.info(
                "Generating dataframe from ES resultset for id: {}".format(
                    request_id))
            downloadable_df = downloadable_df[settings.DOWNLOAD_HEADERS]
        except:
            # if there is no human verdict match
            comp_logger.info(
                "ES resultset for human_verdict match is empty for id: {}".format(
                    request_id))
            downloadable_df = pd.DataFrame(columns=settings.DOWNLOAD_HEADERS)

        if not os.path.exists(output_file):
            comp_logger.info(
                "ES human_verdict match resultset initiated for id: {}".format(
                    request_id))
            downloadable_df.to_csv(output_file,
                                   sep='\t',
                                   index=False,
                                   encoding='iso-8859-1')
        else:
            comp_logger.info(
                "ES human_verdict match resultset already generated for id: {}"
                .format(request_id))
Example #18
def lu():
    rut = deu.get_rut()
    try:
        if len(rut) > 1:
            pass
    except:
        return redirect(url_for("form_rut"))

    eco = ecomart.Ecomart()
    lu_docs = eco.query_lu_rut(rut)
    try:
        #lu = lu_docs['hits']['hits'][0]['_source']
        #df = json2html.convert(json = lu)
        df = Select.from_dict(lu_docs).to_pandas()
        df.drop(columns=['_index', '_type', '_id', '_score'],
                inplace=True,
                errors='ignore')
        df_lu = df[[
            'fecha_generacion', 'lista_cliente', 'motivo_no_pago',
            'fase_inicio', 'fase_dia', 'saldo_cliente', 'dias_mora_real',
            'saldo_castigado', 'sucursal_cobranza', 'banca_cliente'
        ]]
        df_lu.sort_values(by=['fecha_generacion'], inplace=True)
        graficar = 0
        if graficar == 1:
            chartt.chart_lu_x_date(df_lu)
        return render_template('lu.html',
                               rut=rut,
                               dv=df['dv'][0],
                               ap_pat=df['apellido_paterno'][0],
                               ap_mat=df['apellido_materno'][0],
                               primer_nombre=df['primer_nombre'][0],
                               lu=df_lu,
                               graficar=graficar)
    except Exception as e:
        return 'sin datos. ' + str(e)
Example #19
def predict(index):
    data = csv_reader(index_name=index, size=10000)
    # print(json.dumps(data, indent=4))
    df = Select.from_dict(data).to_pandas()

    df["Time"] = df["Time"].str.replace("T", " ").str[0:19]

    df_col_time = df["Time"].str.split(' ', expand=True)
    df["Date"] = df_col_time[0]
    df["Date_Time"] = df_col_time[1]

    df["Value_comb"] = '0'

    df.loc[pd.isnull(df["Value"]), "Value_comb"] = '0'
    df.loc[(df["Value"] > '0') & (df["Value"] < '0.000001'),
           "Value_comb"] = '1'
    df.loc[(df["Value"] >= '0.000001') & (df["Value"] < '0.00001'),
           "Value_comb"] = '2'
    df.loc[(df["Value"] >= '0.00001') & (df["Value"] < '0.0001'),
           "Value_comb"] = '3'
    df.loc[(df["Value"] >= '0.0001') & (df["Value"] < '0.001'),
           "Value_comb"] = '4'
    df.loc[(df["Value"] >= '0.001') & (df["Value"] < '0.01'),
           "Value_comb"] = '5'
    df.loc[(df["Value"] >= '0.01') & (df["Value"] < '0.1'), "Value_comb"] = '6'
    df.loc[(df["Value"] >= '0.1') & (df["Value"] < '1'), "Value_comb"] = '7'
    df.loc[(df["Value"] >= '1') & (df["Value"] < '10'), "Value_comb"] = '8'
    df.loc[(df["Value"] >= '10') & (df["Value"] < '100'), "Value_comb"] = '9'
    df.loc[(df["Value"] >= '100') & (df["Value"] < '1000'),
           "Value_comb"] = '10'
    df.loc[(df["Value"] >= '1000') & (df["Value"] < '10000'),
           "Value_comb"] = '11'
    df.loc[(df["Value"] >= '10000') & (df["Value"] < '100000'),
           "Value_comb"] = '12'
    df.loc[(df["Value"] >= '100000') & (df["Value"] < '1000000'),
           "Value_comb"] = '13'
    df.loc[(df["Value"] >= '1000000') & (df["Value"] < '10000000'),
           "Value_comb"] = '14'
    df.loc[(df["Value"] >= '10000000') & (df["Value"] < '100000000'),
           "Value_comb"] = '15'
    df.loc[(df["Value"] >= '100000000') & (df["Value"] < '1000000000'),
           "Value_comb"] = '16'
    df.loc[(df["Value"] >= '1000000000') & pd.notnull(df["Value"]),
           "Value_comb"] = '17'

    df["Value_comb"] = df["Value_comb"].astype(float)
    # data["Time"] = pd.to_datetime(data["Time"])

    df["Num_Date_Time"] = df["Date_Time"].str.replace(":", ".").str[0:5]
    # df["Num_Date_Time"] = df["Num_Date_Time"].str[3:8]
    df["Num_Date_Time"] = df["Num_Date_Time"].astype(float)

    new_data = df[["Num_Date_Time", "Value_comb"]].copy()

    indexnewData = new_data.set_index(["Num_Date_Time"])

    rollmean = indexnewData.rolling(window=13).mean()
    rollstd = indexnewData.rolling(window=13).std()

    new_data.to_csv('export_dataframe.csv')

    series = read_csv('export_dataframe.csv', header=0, index_col=0)

    series.set_index(['Num_Date_Time'], inplace=True)

    # create a difference transform of the dataset
    def difference(dataset):
        diff = list()
        for i in range(1, len(dataset)):
            value = dataset[i] - dataset[i - 1]
            diff.append(value)
        return numpy.array(diff)

    # Make a prediction given regression coefficients and lag obs
    def predict(coef, history):
        yhat = coef[0]
        for i in range(1, len(coef)):
            yhat += coef[i] * history[-i]
        return yhat

    # split dataset
    X = difference(series.values)
    size = int(len(X) * 0.66)
    train, test = X[0:size], X[size:]

    # train autoregression
    window_size = 6

    model = AR(train)
    model_fit = model.fit(maxlag=window_size, disp=False)

    # save model to file
    # model_fit.save('ar_model.pkl')
    # save model using pickle
    pickle.dump(model_fit, open('pickle_model.pkl', 'wb'))
    # save the differenced dataset
    numpy.save('ar_data.npy', X)
    # save the last observation
    numpy.save('ar_obs.npy', [series.values[-1]])
    # save coefficients
    coef = model_fit.params
    numpy.save('man_model.npy', coef)
    # save lag
    lag = X[-window_size:]
    numpy.save('man_data.npy', lag)

    window = model_fit.k_ar
    coef = model_fit.params

    # walk forward over time steps in test
    history = [train[i] for i in range(len(train))]
    predictions = list()
    for t in range(len(test)):
        yhat = predict(coef, history)
        obs = test[t]
        predictions.append(yhat)
        history.append(obs)
    error = mean_squared_error(test, predictions)
    print('Test MSE: %.3f' % error)

    # load the AR model from file
    model = ARResults.load('pickle_model.pkl')
    # print(loaded.params)
    data = numpy.load('ar_data.npy')
    last_ob = numpy.load('ar_obs.npy')
    print(last_ob)
    coef = numpy.load('man_model.npy')
    print(coef)
    lag = numpy.load('man_data.npy')
    print(lag)

    # make prediction
    # prediction = predict(coef, lag)
    prediction = predict(coef, lag)
    # transform prediction
    yhat = prediction + last_ob[0]
    print('Prediction: %f' % yhat)

    x_predict = []
    y_predict = []
    x_test = []
    y_test = []

    for data in predictions:
        point = data.shape
        if len(point) == 1:
            x_predict.append(point[0])
            y_predict.append(None)
        else:
            x_predict.append(point[0])
            y_predict.append(point[1])

    for data in test:
        point = data.shape
        if len(point) == 1:
            x_test.append(point[0])
            y_test.append(None)
        else:
            x_test.append(point[0])
            y_test.append(point[1])

    # result = ((x_predict, y_predict), (x_test, y_test))
    result = (predictions, test)
    return result
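The AR and ARResults classes used above were removed from recent statsmodels releases in favour of AutoReg. An equivalent training step, as a sketch under that assumption (train, window_size and numpy are the names already defined in the example):

from statsmodels.tsa.ar_model import AutoReg

# ravel() guards against a (n, 1)-shaped training array
model = AutoReg(numpy.asarray(train).ravel(), lags=window_size)
model_fit = model.fit()
coef = model_fit.params              # intercept followed by the lag coefficients
model_fit.save('pickle_model.pkl')   # reload later with AutoRegResults.load()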
Example #20
from collections import Counter
import seaborn as sns

from pandasticsearch import Select


from operator import itemgetter 
from elasticsearch import Elasticsearch


"""Import Data drom ELasticsearch"""
es  = Elasticsearch(["localhost:9200"])

order = es.search(index='commandes',body={},size=1000)
customer = es.search(index='custommer',body={},size=1000)
order_df = Select.from_dict(order).to_pandas()
customer_df = Select.from_dict(customer).to_pandas()


"""Prepare DataSet With  GroupBy"""
#order_df.groupby(order_df.version.apply(lambda x: x['major'])).size()
"""Essay 1"""
def productReturn():
    # NOTE: the return inside the nested loop means only the first product of
    # the first order is returned
    for i in order_df.orderItems:
        for j in i:
            return j['product']


productReturn()
#cd = order_df.groupby('orderItems').apply(lambda x : x)
""""""
Example #21
def update_task(words, progress, updaterate):

    startid = progress['task']

    if startid > 0:
        task_p = pd.read_json('data/pms_task.json')
    else:
        task_p = pd.DataFrame()

    while True:

        # sleep(1)

        result_dict = es.search(index="pms_task",
                                body={
                                    "query": {
                                        "constant_score": {
                                            "filter": {
                                                "range": {
                                                    "id": {
                                                        "gt":
                                                        startid,
                                                        "lt":
                                                        startid + updaterate +
                                                        1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                size=10000)

        if len(result_dict['hits']['hits']) == 0:
            break

        task_new = Select.from_dict(result_dict).to_pandas()[[
            'name', 'id', 'desc'
        ]]

        # from nohtml import strip_tags
        # task_new['desc'] = task_new['desc'].apply(lambda x:strip_tags(x))
        task_new = task_new.loc[task_new['id'].drop_duplicates().index, :]

        task_new = task_new.set_index(task_new['id'].values)

        #task_new['id'] = task_new['id'].astype(str)

        task_new['seg'] = ''

        for i in task_new['id'].values:
            print(i)

            seg = segmentation(
                (task_new['name'][i] + task_new['desc'][i]).split())
            word_count = Counter(seg)
            print(word_count)
            wordn = sum(word_count.values())
            for word in word_count:
                word_count[word] = word_count[word] / wordn
            print(word_count)

            words += Counter(word_count.keys())
            task_new['seg'][i] = dict(word_count)

        task_p = pd.concat([task_p, task_new], axis=0)

        startid = np.sort(task_new['id'].values)[-1]

    progress['task'] = int(startid)

    task_p.to_json('data/seg/pms_task.json')
Example #22
# NOTE: this example is truncated in the original source; the opening of the
# es.search(...) call (index name and the first bool clause) is missing.
                                         ]
                                     }
                                 },
                                 {
                                     "range": {
                                         "m_LogDate": {
                                             "gte": "2020-12-21T00:00:00.000Z",
                                             "lte": "2020-12-28"
                                         }
                                     }
                                 },
                             ]
                         }
                     }
                 })

df = Select.from_dict(data).to_pandas()
df.drop_duplicates(subset='m_To', keep='last', inplace=True)
df.to_csv(directory + "blocksall1306maymta2.csv", index=False)

df['Blocker'] = df.apply(lambda row: blocker(row), axis=1)

to_dropcols = ['m_LogEntry', 'm_LogDate','_id', '_index', '_score', '_type',\
'm_From','m_LogType','m_MessageId', 'm_SubmissionDate']
df.drop(to_dropcols, axis=1, inplace=True)
print(df.shape)
print(list(df.columns.values))
print(df)
#print (data['hits'][1])
df.to_csv(directory + "blocks28decmta1.csv", index=False)
Example #23
def update_story(words, progress, updaterate):

    startid = progress['story']

    if startid > 0:
        story_p = pd.read_json('data/pms_story.json')
    else:
        story_p = pd.DataFrame()

    while True:

        #sleep(1)

        result_dict = es.search(index="pms_story",
                                body={
                                    "query": {
                                        "constant_score": {
                                            "filter": {
                                                "range": {
                                                    "story": {
                                                        "gt":
                                                        startid,
                                                        "lt":
                                                        startid + updaterate +
                                                        1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                size=10000)

        if len(result_dict['hits']['hits']) == 0:
            break

        story_new = Select.from_dict(result_dict).to_pandas()[[
            'story', 'title', 'spec'
        ]]

        # sql = 'select story, title, spec from zt_storyspec where story > {0} limit {1}'.format(startid, updaterate)
        # story_new = pd.read_sql(sql, engine)
        # ifremain = (len(story_new) == updaterate)
        # print(len(story_new))
        #
        # print(ifremain)

        # from nohtml import strip_tags
        # story_new['spec'] = story_new['spec'].apply(lambda x:strip_tags(x))
        story_new = story_new.loc[
            story_new['story'].drop_duplicates().index, :]

        story_new = story_new.set_index(story_new['story'].values)

        story_new['seg'] = ''
        for i in story_new['story'].values:

            seg = segmentation(
                (story_new['title'][i] + story_new['spec'][i]).split())
            word_count = Counter(seg)
            print(i)
            print(word_count)
            wordn = sum(word_count.values())
            for word in word_count:
                word_count[word] = word_count[word] / wordn
            #print(word_count)

            words += Counter(word_count.keys())
            story_new['seg'][i] = dict(word_count)

        story_p = pd.concat([story_p, story_new], axis=0)

        startid = np.sort(story_new['story'].values)[-1]

        print(startid)

    progress['story'] = int(startid)

    story_p.to_json('data/seg/pms_story.json')
Example #24
def update_bug(words, progress, updaterate):

    startid = progress['bug']

    if startid > 0:
        bug_p = pd.read_json('data/pms_bug.json')
    else:
        bug_p = pd.DataFrame()

    while True:

        # sleep(1)

        result_dict = es.search(index="pms_bug",
                                body={
                                    "query": {
                                        "constant_score": {
                                            "filter": {
                                                "range": {
                                                    "id": {
                                                        "gt":
                                                        startid,
                                                        "lt":
                                                        startid + updaterate +
                                                        1
                                                    }
                                                }
                                            }
                                        }
                                    }
                                },
                                size=10000)

        if len(result_dict['hits']['hits']) == 0:
            break

        bug_new = Select.from_dict(result_dict).to_pandas()[[
            'id', 'title', 'steps'
        ]]

        bug_new = bug_new.loc[bug_new['id'].drop_duplicates().index, :]

        bug_new = bug_new.set_index(bug_new['id'].values)

        bug_new['seg'] = ''
        for i in bug_new['id'].values:
            #print(i)

            seg = segmentation(
                (bug_new['title'][i] + bug_new['steps'][i]).split())
            word_count = Counter(seg)
            #print(word_count)
            wordn = sum(word_count.values())
            for word in word_count:
                word_count[word] = word_count[word] / wordn
            #print(word_count)

            words += Counter(word_count.keys())
            bug_new['seg'][i] = dict(word_count)

        bug_p = pd.concat([bug_p, bug_new], axis=0)

        startid = np.sort(bug_new['id'].values)[-1]
        print(startid)

    progress['bug'] = int(startid)

    bug_p.to_json('data/seg/pms_bug.json')
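update_task, update_story and update_bug all rely on a segmentation() helper (and a module-level es client) that are not included in these examples. A hypothetical sketch of such a helper, here based on jieba since the PMS text being split is presumably Chinese:

import jieba

def segmentation(tokens):
    # Split each whitespace-separated token into words and drop empty pieces
    words = []
    for token in tokens:
        words.extend(w for w in jieba.cut(token) if w.strip())
    return words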
Example #25
from elasticsearch import Elasticsearch
from pandasticsearch import Select
import pandas as pd
from k_anonymity import *
import util
import os
script_dir = os.path.dirname(__file__)


'''get Elasticsearch data as a Python dict'''
es = Elasticsearch('http://localhost:9200')
result_dict = es.search(index="kibana_sample_data_logs", body={"query": {"match_all": {}}},size=10000)
df = Select.from_dict(result_dict).to_pandas()

'''data cleaning'''
df = util.explode(df, ['tags'])
for column in df.select_dtypes('object').columns:
    df[column]=df[column].astype('category')
df['response']=df['response'].astype('category')
print("Data type of columns:")
print(df.dtypes)

'''analyse data span'''
full_spans = get_spans(df, df.index)
print("Full span:")
print(full_spans)

'''choose column to be anonymized'''
feature_columns = ['geo.coordinates.lat', 'geo.coordinates.lon']
sensitive_column = 'response'
def get_unique_sensor_ids_around_geo_location(geo_shape, filter_by_sensor_types=None):
    if filter_by_sensor_types is None:
        filter_by_sensor_types = []
    
    size = 1000
    search_query = {
        "size": size,
        "query": {"bool": {}},
        "aggs": {
            "unique_sensor_ids": {
                "terms": {
                    "field": "sensor_id"
                }
            }
        }
    }
    
    geo_data = {
        "geo_polygon": {
            "ignore_unmapped": True,
            "geo_location": {
                "points": geo_shape
            }
        }
    }
    
    search_query["query"]["bool"]["filter"] = geo_data
    
    if filter_by_sensor_types:
        search_query["query"]["bool"]["must"] = {
            "terms": {
                "sensor_type": filter_by_sensor_types
            }
        }
    
    # query the results and pass a param: scroll=1m
    response = es.search(index=es_index_name, doc_type=es_doc_type, body=search_query, params={'scroll': '1m'})
    
    # get the scroll id
    scroll_id = response.get('_scroll_id')
    total_results = response['hits']['total']  # an int on ES <= 6; ['total']['value'] on ES 7+
    
    scroll_size = total_results
    
    from pandasticsearch import Select
    df = Select.from_dict(response).to_pandas()
    
    results_fetched = size
    
    while scroll_size > 0:
        page = es.scroll(scroll_id=scroll_id, scroll='2m')
        
        # Update the scroll ID
        scroll_id = page.get('_scroll_id')
        
        # Get the number of results that we returned in the last scroll
        scroll_size = len(page['hits']['hits'])
        
        message = "Fetching {}/{} results ({}%)".format(results_fetched, total_results,
                                                        round((results_fetched / total_results) * 100, 2))
        print(message)
        
        # Do something with the obtained page
        df_page_next = Select.from_dict(page).to_pandas()
        
        df = pd.concat([df, df_page_next], ignore_index=True)
        
        results_fetched += scroll_size
        
        print("")
    
    # get the unique sensor_id
    df_sensor_ids = df['sensor_id'].unique()
    
    # sort the ids
    df_sensor_ids.sort()
    
    unique_sensor_ids = list(df_sensor_ids)
    
    return unique_sensor_ids
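The function above also depends on module-level es, es_index_name and es_doc_type values. A hypothetical call, with a placeholder polygon and sensor type:

polygon = [
    {"lat": 52.52, "lon": 13.39},
    {"lat": 52.54, "lon": 13.39},
    {"lat": 52.54, "lon": 13.43},
    {"lat": 52.52, "lon": 13.43},
]
sensor_ids = get_unique_sensor_ids_around_geo_location(
    polygon, filter_by_sensor_types=["temperature"])
print(len(sensor_ids), "unique sensor ids inside the polygon")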