def computeBenfordLawExcelDataSet():
    """Flask view: run a Benford's-Law first-digit test on one column of an Excel file.

    Query-string parameters: ``fileName`` (path of the Excel workbook) and
    ``columnName``.  Computes simple and log returns of the column and feeds
    the log returns to ``bf.first_digits`` (digs=1 -> leading digit 1-9).
    Returns a plain-text status message.
    """
    fileName = request.args.get('fileName')
    column = request.args.get('columnName')
    if isEmptyString(fileName) or isEmptyString(column):
        return "fileName or Column is Empty"
    # BUG FIX: `skipinitialspace` is a read_csv-only option; pd.read_excel
    # does not document it and current pandas rejects it with TypeError.
    ds = pd.read_excel(fileName)
    # Check column name
    if not isInputColumnNameValid(ds, column):
        return "Column:" + column + " Does not exists in the dataset"
    if findColumnCount(ds, column) > 0:
        simpleReturnCol = 'simpleReturn' + '_' + column
        logReturnCol = 'logReturn' + '_' + column
        ds[simpleReturnCol] = ds[column] / ds[column].shift() - 1   # simple returns
        ds[logReturnCol] = np.log(ds[column] / ds[column].shift())  # log returns
        bf.first_digits(ds[logReturnCol], digs=1, decimals=8)  # digs=1 for the first digit (1-9)
        return ("Benford's Law verification for the data set in the file:"
                + fileName + "-" + "Complete")
    return "Data set is empty / Column" + "-" + column + "-Does not have any data"
def computeBenfordLawWebDataSetService():
    """Flask view: run a Benford's-Law first-digit test on one column of a
    whitespace-delimited data set fetched from a URL.

    Query-string parameters: ``dataURL`` and ``columnName``.  The remote file
    is read without a header row and its columns are labelled A1..A15
    (assumes exactly 15 columns — TODO confirm against the data source).
    Returns a plain-text status message.
    """
    dataURL = request.args.get('dataURL')
    column = request.args.get('columnName')
    if isEmptyString(dataURL) or isEmptyString(column):
        return "dataURL or Column is Empty"
    column_names = ['A%d' % i for i in range(1, 16)]  # A1..A15
    # BUG FIX: `delim_whitespace=True` is deprecated (removed in pandas 3.0);
    # sep=r'\s+' is the documented equivalent.
    dataset = pd.read_csv(dataURL, sep=r'\s+', header=None)
    ds = pd.DataFrame(dataset.values, columns=column_names)
    # Check column name
    if not isInputColumnNameValid(ds, column):
        return "Column:" + column + " Does not exists in the dataset"
    if findColumnCount(ds, column) > 0:
        simpleReturnCol = 'simpleReturn' + '_' + column
        logReturnCol = 'logReturn' + '_' + column
        ds[simpleReturnCol] = ds[column] / ds[column].shift() - 1   # simple returns
        ds[logReturnCol] = np.log(ds[column] / ds[column].shift())  # log returns
        bf.first_digits(ds[logReturnCol], digs=1, decimals=8)  # digs=1 for the first digit (1-9)
        return "Benford's Law verification for Fraud Detection is complete."
    return "Data set is empty / Column" + "-" + column + "-Does not have any data"
from math import log10, floor


def most_significant_digits(num, n=2):
    """Return the first `n` significant digits of abs(num) as an int.

    BUG FIX: the original used int(str(num)[:2]), which crashes on floats
    below 1 (str(0.05)[:2] == '0.'), is wrong for numbers in scientific
    notation, and returns a single digit for single-digit integers.  The
    log10-based form works for any nonzero real number.
    """
    if num == 0:
        return 0
    num = abs(num)
    return int(num // 10 ** (floor(log10(num)) - (n - 1)))


# Counts of the two leading significant digits; valid bucket values are 10..99.
count = [0] * 100
for i, row in uik_data.iterrows():
    x = abs(row[putin])
    if x == 0:
        continue  # zero has no significant digits
    count[most_significant_digits(x, 2)] += 1

total = sum(count)

# BUG FIX: the two-digit Benford frequencies are defined for d = 10..99
# (they sum to `total` over exactly that range); the original built 100 bars
# for i = 1..100 and plotted them one position off against the counts.
digit_range = range(10, 100)
benford = [total * log10(1 + 1. / d) for d in digit_range]
plt.bar(digit_range, count[10:100])
plt.bar(digit_range, benford, alpha=0.4)
plt.xlabel("Digit")
plt.ylabel("No of uik's")
plt.legend((putin, "Benford's Law"))

# Per-region first-digit tests, then whole-data-set tests on several columns.
tests = uik_data.groupby('region').apply(
    lambda x: bf.first_digits(x[putin].astype('float'), digs=1))

f1d = bf.first_digits(uik_data[valid].astype('float'), digs=1)  # digs=1 for the first digit (1-9)
f1d = bf.first_digits(uik_data[putin].astype('float'), digs=2)  # first two digits (10-99)
f1d = bf.first_digits(uik_data[total_voters].astype('float'), digs=2)
f3d = bf.first_digits(uik_data[putin].astype('float'), digs=1)
def _clean_pdf_text(file_path):
    """Extract the full text of a PDF, then normalise, stem and
    stop-word-filter it (Sastrawi stemmer/remover, i.e. Indonesian text)."""
    reader = PyPDF2.PdfFileReader(file_path)
    text = ''.join(page.extractText() for page in reader.pages)
    # One translate() pass replaces the original chain of per-character
    # .replace() calls: punctuation/newlines -> space, decimal digits removed.
    table = {ord(ch): ' ' for ch in "\n:./,()"}
    table.update({ord(d): None for d in digits})
    text = ' '.join(text.lower().split())
    text = text.translate(table)
    # stemming
    text = StemmerFactory().create_stemmer().stem(text)
    # stop-word removal
    return StopWordRemoverFactory().create_stop_word_remover().remove(text)


def _benford_from_pdf(file_path):
    """Collect the numeric tokens of a PDF (skipping 4-digit years) and run a
    first-digit Benford test over them; returns the bf.first_digits result."""
    year_re = re.compile(r"^(19|20)\d{2}$")
    values = []
    reader = PyPDF2.PdfFileReader(file_path)
    for page in reader.pages:
        for token in re.findall(r"[\d.]*\d+", page.extractText()):
            token = str(token)
            if year_re.findall(token):
                continue  # looks like a calendar year, not a measured value
            token = token.replace(".", "")
            if len(token) > 17:
                token = token[:16]  # keep within float precision
            values.append(token)
    df = pd.DataFrame(values, columns=['nilai'])
    # BUG FIX: np.float was removed in NumPy 1.24; the builtin float is the
    # documented replacement.
    return bf.first_digits(data=df['nilai'].astype(float), digs=1,
                           decimals=8, confidence=95)


def process(path_file):
    """Fill in the missing derived columns (raw text, text-network graph,
    Benford-test object) for one uploaded PDF in the `corpus` table, then
    redirect to the file's index page.

    Each derived column is only (re)computed when it is currently NULL.
    """
    filename = secure_filename(path_file)
    if filename != '':
        conn = sqlite3.connect(
            os.path.join(current_app.config['DB_PATH'], 'auditree.db'),
            timeout=10)
        try:
            c = conn.cursor()
            c.execute(
                "SELECT text_raw,text_network_object,benford_object "
                "FROM corpus WHERE filename = ?", [filename])
            data = c.fetchall()
            if len(data) != 0:
                row = list(data[0])
                file_path = os.path.join(
                    current_app.config['UPLOAD_PATH'], filename)
                if row[0] is None:  # content raw
                    content_text = Corpus.get_text_from_pdf(file_path)
                    c.execute(
                        "UPDATE corpus SET text_raw = ? WHERE filename = ?",
                        [content_text, filename])
                if row[1] is None:  # text network
                    graph = Corpus.process_graph(_clean_pdf_text(file_path))
                    c.execute(
                        "UPDATE corpus SET text_network_object = ? "
                        "WHERE filename = ?",
                        [json.dumps(graph), filename])
                if row[2] is None:  # Benford object
                    fld = _benford_from_pdf(file_path)
                    result_json = fld.to_json(orient="split")
                    c.execute(
                        "UPDATE corpus SET benford_object = ? "
                        "WHERE filename = ?",
                        [result_json, filename])
            conn.commit()
        finally:
            # BUG FIX: the connection now closes on every path, including
            # when the SELECT returns no rows or an exception is raised.
            conn.close()
    return redirect(
        url_for('home_blueprint.index_filename', filename=filename))
# NOTE: when a Jupyter notebook is exported to a .py file the following magic
# line is generated automatically and must stay disabled outside IPython:
# get_ipython().run_line_magic('matplotlib', 'inline')
import os
import numpy as np
import pandas as pd

# BUG FIX: raw string for the Windows path — '\D' and '\c' in the original
# were invalid escape sequences (SyntaxWarning on modern Python) that only
# worked because Python leaves unknown escapes literal.  Runtime value is
# unchanged.
os.chdir(r'C:\Users\meric\Desktop\codes')

# In[26]:
sp = pd.read_csv('SPY.csv', index_col='Date', parse_dates=True)

# In[27]:
# adding '_' to facilitate handling the column
sp.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)
sp['p_r'] = sp.Close / sp.Close.shift() - 1      # simple returns
sp['l_r'] = np.log(sp.Close / sp.Close.shift())  # log returns
# sp.tail()

# In[28]:
# BUG FIX: the original opened the output file via print(file=open(...)) and
# never closed it; `with` guarantees the handle is flushed and closed.
with open("OutputBenfordtxt.txt", "a") as out:
    print(bf.first_digits(sp.l_r, digs=1, decimals=8), file=out)
bf.first_digits(sp.l_r, digs=1, decimals=8, show_plot=True,
                save_plot=r'C:\Users\meric\Desktop\codes\OutputBenford.pdf')