示例#1
0
def computeBenfordLawExcelDataSet():
    """Run a first-digit Benford test on the log returns of one Excel column.

    Reads the ``fileName`` and ``columnName`` query parameters from the
    current ``request``, loads the workbook with pandas, adds a simple-return
    and a log-return column derived from the requested column, and passes the
    log returns to ``bf.first_digits``.

    Returns:
        str: a human-readable status message. It reports a missing parameter,
        an unknown column, an empty column, or completion of the test.
    """
    fileName = request.args.get('fileName')
    column = request.args.get('columnName')

    if isEmptyString(fileName) or isEmptyString(column):
        return "fileName or Column is Empty"

    # NOTE(review): fileName comes straight from the query string and is
    # handed to pandas unchanged - confirm the caller restricts which paths
    # or URLs may be read.
    # `skipinitialspace` is a read_csv option; read_excel does not list it
    # and current pandas rejects unknown keywords, so it is not passed here.
    ds = pd.read_excel(fileName)

    # Check column name
    if not isInputColumnNameValid(ds, column):
        return "Column:" + column + " Does not exists in the dataset"

    if findColumnCount(ds, column) > 0:
        simpleReturnCol = 'simpleReturn' + '_' + column
        logReturnCol = 'logReturn' + '_' + column

        # Ratio of each value to the previous row; the first row has no
        # predecessor and therefore yields NaN.
        ratio = ds[column] / ds[column].shift()
        ds[simpleReturnCol] = ratio - 1
        ds[logReturnCol] = np.log(ratio)

        # digs=1 for the first digit (1-9)
        bf.first_digits(ds[logReturnCol], digs=1, decimals=8)

        return "Benford's Law verification for the data set in the file:" + fileName + "-" + "Complete"

    return "Data set is empty / Column" + "-" + column + "-Does not have any data"
示例#2
0
def computeBenfordLawWebDataSetService():
    """Run a first-digit Benford test on the log returns of a remote data set.

    Reads the ``dataURL`` and ``columnName`` query parameters from the current
    ``request``, loads the whitespace-delimited, header-less data with pandas,
    labels its columns ``A1`` .. ``A15``, adds a simple-return and a
    log-return column derived from the requested column, and passes the log
    returns to ``bf.first_digits``.

    Returns:
        str: a human-readable status message. It reports a missing parameter,
        an unknown column, an empty column, or completion of the test.

    Raises:
        ValueError: if the loaded data does not have exactly 15 columns.
    """
    dataURL = request.args.get('dataURL')
    column = request.args.get('columnName')

    if isEmptyString(dataURL) or isEmptyString(column):
        return "dataURL or Column is Empty"

    column_names = ['A' + str(i) for i in range(1, 16)]

    # NOTE(review): dataURL comes straight from the query string and is
    # fetched as-is - confirm the caller restricts which locations are allowed.
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True.
    ds = pd.read_csv(dataURL, sep=r"\s+", header=None)

    # Rename in place instead of rebuilding the frame from `.values`: that
    # round-trip turns every column into object dtype when the data is mixed,
    # and np.log below cannot operate on object columns.
    ds.columns = column_names

    # Check column name
    if not isInputColumnNameValid(ds, column):
        return "Column:" + column + " Does not exists in the dataset"

    if findColumnCount(ds, column) > 0:
        simpleReturnCol = 'simpleReturn' + '_' + column
        logReturnCol = 'logReturn' + '_' + column

        # Ratio of each value to the previous row; the first row has no
        # predecessor and therefore yields NaN.
        ratio = ds[column] / ds[column].shift()
        ds[simpleReturnCol] = ratio - 1
        ds[logReturnCol] = np.log(ratio)

        # digs=1 for the first digit (1-9)
        bf.first_digits(ds[logReturnCol], digs=1, decimals=8)
        return "Benford's Law verification for Fraud Detection is complete."

    return "Data set is empty / Column" + "-" + column + "-Does not have any data"
示例#3
0
from math import log10, floor
# One zero-initialised tally slot per index 0..99.
count = [0 for _ in range(10 * 10)]

def most_significant_digits(num, n):
    """Return the leading ``n`` significant decimal digits of ``num`` as an int.

    The sign is ignored. Whole numbers with fewer than ``n`` digits are
    returned unchanged (``5`` with ``n=2`` gives ``5``), and whole-valued
    floats behave like the matching integer (``12.0`` gives ``12``). For
    fractional values the decimal point and leading zeros are skipped
    (``0.057`` with ``n=2`` gives ``57``).

    Args:
        num: a real number (int, float, or anything whose ``str()`` is a
            decimal literal).
        n: how many leading digits to keep; must be at least 1.

    Returns:
        int: the leading digits, or ``0`` when ``num`` is zero.

    Raises:
        ValueError: if ``n`` is smaller than 1 or ``num`` is NaN/infinite.
    """
    from decimal import Decimal

    if num == 0:
        return 0
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Go through str() so floats use their shortest repr instead of the full
    # binary expansion (0.1 stays "0.1").
    value = abs(Decimal(str(num)))
    if not value.is_finite():
        raise ValueError("num must be a finite number")

    if value == value.to_integral_value():
        # Whole numbers (including 12.0 or 1e20): use the plain integer digits.
        digit_text = str(int(value))
    else:
        # Fractions: the coefficient digits already exclude the decimal point.
        digit_text = "".join(str(d) for d in value.as_tuple().digits).lstrip("0")

    return int(digit_text[:n])

# Tally the leading digits of every non-zero value in the `putin` column.
# count[d] is incremented for each row whose leading digits equal d.
for _, row in uik_data.iterrows():
    x = abs(row[putin])
    if x == 0:
        continue
    count[most_significant_digits(x, 2)] += 1


total = sum(count)
benford = [total*log10(1 + 1./i) for i in range(1, 101)]

# count is indexed by the digit value itself, so draw it at x = 0..99. The
# expected bar for digit i sits at x = i, which lines the two series up.
# NOTE(review): the expected series mixes one-digit (1-9) and two-digit
# (10-99) probabilities, each of which sums to 1 on its own - confirm the
# intended scaling.
plt.bar(range(len(count)), count)
plt.bar(range(1, 101), benford, alpha=0.4)
plt.xlabel("Digit")
plt.ylabel("No of uik's")
plt.legend((putin, "Benford's Law"))

# First-digit test of the `putin` column, run separately for every region.
tests = uik_data.groupby('region').apply(
    lambda grp: bf.first_digits(grp[putin].astype('float'), digs=1))


# Each call below overwrites f1d; only the last result is kept.
f1d = bf.first_digits(uik_data[valid].astype('float'), digs=1)  # digs=1 for the first digit (1-9)
f1d = bf.first_digits(uik_data[putin].astype('float'), digs=2)  # digs=2 for the first two digits
f1d = bf.first_digits(uik_data[total_voters].astype('float'), digs=2)  # digs=2 for the first two digits
f3d = bf.first_digits(uik_data[putin].astype('float'), digs=1)  # digs=1 for the first digit (1-9)
示例#4
0
    def process(path_file):
        """Fill in the missing analysis columns of a corpus row, then redirect.

        The sanitized file name is looked up in the ``corpus`` table of
        ``auditree.db``. For each of ``text_raw``, ``text_network_object`` and
        ``benford_object`` that is still NULL in the first matching row, the
        value is computed from the uploaded PDF and written back. Changes are
        committed once at the end; the connection is always closed.

        Args:
            path_file: name (or path) of the uploaded PDF. It is passed
                through ``secure_filename`` before being used.

        Returns:
            The redirect response to ``home_blueprint.index_filename`` for the
            sanitized file name.
        """
        filename = secure_filename(path_file)
        if filename != '':
            conn = sqlite3.connect(
                os.path.join(current_app.config['DB_PATH'], 'auditree.db'),
                timeout=10)
            # Release the connection even when PDF parsing or a query raises.
            try:
                c = conn.cursor()
                c.execute(
                    "SELECT text_raw,text_network_object,benford_object FROM corpus WHERE filename = ?",
                    [filename])
                data = c.fetchall()
                if len(data) != 0:
                    raw_text, network_object, benford_object = data[0]
                    file_path = os.path.join(
                        current_app.config['UPLOAD_PATH'], filename)

                    if raw_text is None:
                        # content raw
                        content_text = Corpus.get_text_from_pdf(file_path)
                        c.execute(
                            "UPDATE corpus SET text_raw = ? WHERE filename = ?",
                            [content_text, filename])

                    if network_object is None:
                        # text network
                        reader = PyPDF2.PdfFileReader(file_path)
                        text = "".join(
                            page.extractText() for page in reader.pages)

                        # cleaning: turn newlines and punctuation into spaces
                        # in one pass, lower-case, collapse whitespace, then
                        # drop every digit character.
                        separators = str.maketrans(
                            {ch: ' ' for ch in "\n:./,()"})
                        text = text.translate(separators)
                        text = text.lower()
                        text = ' '.join(text.split())
                        text = text.translate({ord(k): None for k in digits})

                        # stemming
                        stemmer = StemmerFactory().create_stemmer()
                        text = stemmer.stem(text)

                        # stop-word removal
                        stopword = StopWordRemoverFactory().create_stop_word_remover()
                        text = stopword.remove(text)

                        # process graph
                        graph = Corpus.process_graph(text)
                        c.execute(
                            "UPDATE corpus SET text_network_object = ? WHERE filename = ?",
                            [json.dumps(graph), filename])

                    if benford_object is None:
                        # Four-digit tokens starting with 19 or 20 are skipped
                        # (they look like years rather than amounts).
                        year_pattern = re.compile(r"^(19|20)\d{2}$")
                        number_pattern = re.compile(r"[\d.]*\d+")
                        res = []
                        reader = PyPDF2.PdfFileReader(file_path)
                        for page in reader.pages:
                            for token in number_pattern.findall(page.extractText()):
                                if year_pattern.match(token):
                                    continue
                                # Dots are removed before conversion
                                # (presumably thousands separators - confirm).
                                token = token.replace(".", "")
                                # Tokens longer than 17 digits are cut to 16.
                                if len(token) > 17:
                                    token = token[:16]
                                res.append(token)
                        df = pd.DataFrame(res, columns=['nilai'])
                        # Use the builtin float: the np.float alias was
                        # removed from NumPy.
                        fld = bf.first_digits(data=df['nilai'].astype(float),
                                              digs=1,
                                              decimals=8,
                                              confidence=95)
                        result_json = fld.to_json(orient="split")
                        c.execute(
                            "UPDATE corpus SET benford_object = ? WHERE filename = ?",
                            [result_json, filename])
                conn.commit()
            finally:
                conn.close()
        return redirect(
            url_for('home_blueprint.index_filename', filename=filename))
# When a Jupyter notebook is converted to a Python file, the line below is added automatically. It has to stay disabled.
#get_ipython().run_line_magic('matplotlib', 'inline')
import os
import numpy as np
import pandas as pd

# Raw strings keep the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\D" or "\c".
os.chdir(r'C:\Users\meric\Desktop\codes')

# In[26]:

sp = pd.read_csv('SPY.csv', index_col='Date', parse_dates=True)

# In[27]:

# Rename 'Adj Close' with an underscore so the column is easier to handle.
sp.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)
sp['p_r'] = sp.Close / sp.Close.shift() - 1  # simple returns
sp['l_r'] = np.log(sp.Close / sp.Close.shift())  # log returns
# sp.tail()

# In[28]:

# NOTE(review): `bf` is not imported in the visible part of this script -
# confirm it is bound (e.g. by an import elsewhere) before this point.
# Run the first-digit test before opening the report so that a failure does
# not leave an empty file behind, then append the result and close the file.
first_digit_report = bf.first_digits(sp.l_r, digs=1, decimals=8)
with open("OutputBenfordtxt.txt", "a") as report_file:
    print(first_digit_report, file=report_file)

# Same test again, this time showing the plot and saving it as a PDF.
bf.first_digits(sp.l_r,
                digs=1,
                decimals=8,
                show_plot=True,
                save_plot=r'C:\Users\meric\Desktop\codes\OutputBenford.pdf')