def main():
    """Plot recall against upsample rate for an SVM, AdaBoost and an FC network.

    For every upsample rate in the selected range the data set is upsampled,
    split 70/30, normalized, and all three models are fitted on the training
    part. A confusion matrix per model and rate is collected on the test part,
    reduced to a recall value with ``util.recall`` and plotted.

    Exits with status 1 when the selected range is empty.
    """
    opts = util.parse_args()
    X, y = util.data_load(opts.dataset)

    # NOTE(review): the same three model objects are fitted again on every
    # loop iteration. Whether the network starts from fresh weights each time
    # depends on the framework's fit() semantics -- confirm this is intended.
    fc_nn_model = fc.create_model()
    ada_model = AdaBoostClassifier(n_estimators=100, random_state=0)
    svm_model = SVC(C=1000, gamma=0.1)

    n = opts.upsamplen if opts.upsamplen is not None else 1
    # NOTE(review): when upsamplestart is supplied its value is ignored and the
    # range starts at 1. Confirm it is meant as a flag and not as the start.
    start = n if opts.upsamplestart is None else 1
    if start > n:
        print("upsample range error: start {} is greater than end {}".format(start, n))
        # Signal the failure to the calling shell instead of exiting with 0.
        sys.exit(1)

    up_range = np.arange(start, n + 1)
    conf_fc, conf_ada, conf_svm = [], [], []
    for t in up_range:
        needed = util.needed_n(X, y, t)
        temp_X, temp_y = util.upsample(X, y, needed)
        X_train, X_test, y_train, y_test = train_test_split(
            temp_X, temp_y, test_size=0.3, random_state=42)
        X_train, X_test = util.normalize(X_train, X_test)

        train_dset = tf.data.Dataset.from_tensor_slices(
            (X_train, y_train)).batch(64, drop_remainder=False).shuffle(buffer_size=10000)
        test_dset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).batch(64)

        ada_model.fit(X_train, y_train)
        svm_model.fit(X_train, y_train)
        fc_nn_model.fit(train_dset, epochs=10)

        conf_ada.append(confusion_matrix(y_test, ada_model.predict(X_test)))
        conf_svm.append(confusion_matrix(y_test, svm_model.predict(X_test)))

        # Confusion matrix for the network: rows are indexed by the true
        # label, columns by the arg-max of the model output.
        fc_conf = np.zeros((2, 2), dtype=int)
        for batch, labels in test_dset:
            predictions = fc_nn_model(batch)
            for i in range(len(batch)):
                fc_conf[labels[i]][np.argmax(predictions[i])] += 1
        conf_fc.append(fc_conf)

    recall_fc = [util.recall(conf) for conf in conf_fc]
    recall_ada = [util.recall(conf) for conf in conf_ada]
    recall_svm = [util.recall(conf) for conf in conf_svm]

    # Insertion order of this dict is the legend order.
    curves = {"SVM": recall_svm, "Adaboost": recall_ada, "FC_NN": recall_fc}
    for values in curves.values():
        plt.plot(up_range, values)
    plt.title("Recall Vs Upsample Graph")
    plt.xlabel("Upsample rate")
    plt.ylabel("Recall")
    plt.legend(list(curves))
    plt.show()
def extract_and_upload(
    vid_path="vids",
    out_frame_skip=3,
    out_duration=4,
    use_roi=True,
    gif_color=False,
    gif_delay=8,
    quiet=False,
    remove_source=True,
    to_imgur=False,
    to_tumblr=False,
    to_snapchat=False,
):
    """Process the first pending video in ``vid_path`` and upload the result.

    The first directory entry that neither ends with "part" nor starts with
    "." is opened as a ``SpelunkyVideo``, passed to ``extract_death`` and then
    uploaded to each enabled service. Afterwards ``vid.out_mp4`` is removed
    and, when ``remove_source`` is true, the source file as well.

    On ``cv2.error``, ``OSError``, ``IOError``, ``TypeError`` or
    ``AttributeError`` the source file is moved to ``problems/`` and the
    traceback is written to stderr; the exception is not re-raised.

    Raises:
        IndexError: if ``vid_path`` contains no eligible file.
    """
    # TEMP WORKAROUND: the last-upload timestamps live in module globals.
    global last_snapchat
    global last_tumblr
    global last_imgur

    input_file = [
        name for name in os.listdir(vid_path)
        if not name.endswith("part") and not name.startswith(".")
    ][0]
    source_path = os.path.join(vid_path, input_file)

    # Bound before the try so the handler below can run even when the
    # SpelunkyVideo constructor itself is what failed.
    vid = None
    try:
        vid = SpelunkyVideo(source_path)
        vid.templates = get_templates(vid.template_scale)
        extract_death(
            vid,
            out_frame_skip=out_frame_skip,
            out_duration=out_duration,
            use_roi=use_roi,
            gif_color=gif_color,
            gif_delay=gif_delay,
            quiet=quiet,
        )

        if to_imgur and time.time() - last_imgur > 0:
            upload_gif_imgur(vid)
            last_imgur = time.time()

        if to_tumblr and time.time() - last_tumblr > 0:
            upload_gif_tumblr(vid)
            last_tumblr = time.time()

        # Snapchat is limited to one post every eight hours.
        if to_snapchat and time.time() - last_snapchat >= 60 * 60 * 8:
            snapchat.login(SNAPCHAT_USER, SNAPCHAT_PASS)
            snapchat_followback()
            send_snapchat(vid)
            last_snapchat = time.time()

        # NOTE(review): the original layout was lost; this cleanup is assumed
        # to be unconditional rather than part of the Snapchat branch.
        os.remove(vid.out_mp4)
        if remove_source:
            os.remove(source_path)

    except (cv2.error, OSError, IOError, TypeError, AttributeError) as e:
        # Prefer the paths recorded on the video object; fall back to the
        # locally known ones when the object was never created.
        failed_path = getattr(vid, "input_file", source_path)
        failed_tail = getattr(vid, "input_file_tail", input_file)

        # Single-argument print calls behave the same on Python 2 and 3.
        print(e)
        print("\nSkipping %s likely due to failure to extract" % (failed_path,))
        print("moving to problems/ %s" % (failed_tail,))
        os.rename(failed_path, "problems/" + failed_tail)

        traceback.print_tb(sys.exc_info()[2], limit=None, file=sys.stderr)
        sys.stderr.flush()


# NOTE(review): this call directly followed the function in the original text.
# Whether it belongs at module level or inside the function could not be
# determined from the source -- confirm before relying on it.
util.recall(extract_and_upload)
def main():
    """Plot recall against upsample rate for the two network variants.

    For every upsample rate in the selected range the data set is upsampled,
    split 70/30 and normalized; the model from ``create_model`` and the one
    from ``create_model_more_layers`` are both fitted on the training part.
    A 2x2 confusion matrix per model and rate is collected on the test part,
    reduced to a recall value with ``util.recall`` and plotted.

    Exits with status 1 when the selected range is empty.
    """
    opts = util.parse_args()
    X, y = util.data_load(opts.dataset)

    # NOTE(review): both model objects are fitted again on every loop
    # iteration. Whether training restarts from fresh weights depends on the
    # framework's fit() semantics -- confirm this is intended.
    model = create_model()
    model_layers = create_model_more_layers()

    n = opts.upsamplen if opts.upsamplen is not None else 1
    # NOTE(review): when upsamplestart is supplied its value is ignored and the
    # range starts at 1. Confirm it is meant as a flag and not as the start.
    start = n if opts.upsamplestart is None else 1
    if start > n:
        # The previous message stated the opposite of the condition checked.
        print("Upsample start should not be larger than end")
        # Signal the failure to the calling shell instead of exiting with 0.
        sys.exit(1)

    up_range = np.arange(start, n + 1)
    all_conf = []
    all_conf_layers = []
    for t in up_range:
        print("t", t)
        needed = util.needed_n(X, y, t)
        temp_X, temp_y = util.upsample(X, y, needed)
        X_train, X_test, y_train, y_test = train_test_split(
            temp_X, temp_y, test_size=0.3, random_state=42)
        X_train, X_test = util.normalize(X_train, X_test)

        train_dset = tf.data.Dataset.from_tensor_slices(
            (X_train, y_train)).batch(64, drop_remainder=False).shuffle(buffer_size=10000)
        test_dset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).batch(64)

        model.fit(train_dset, epochs=10)
        model_layers.fit(train_dset, epochs=10)

        # Rows are indexed by the true label, columns by the arg-max of the
        # model output.
        conf_mat = np.zeros((2, 2), dtype=int)
        conf_mat_layers = np.zeros((2, 2), dtype=int)
        for batch, labels in test_dset:
            predictions = model(batch)
            predictions_layers = model_layers(batch)
            for i in range(len(batch)):
                conf_mat[labels[i]][np.argmax(predictions[i])] += 1
                conf_mat_layers[labels[i]][np.argmax(predictions_layers[i])] += 1
        all_conf.append(conf_mat)
        all_conf_layers.append(conf_mat_layers)

    re_fc = [util.recall(conf) for conf in all_conf]
    re_fc_layer = [util.recall(conf) for conf in all_conf_layers]

    plt.plot(up_range, re_fc)
    plt.plot(up_range, re_fc_layer)
    plt.title("2-layer NN vs 3-layer NN")
    plt.legend(["2-layer", "3-layer"])
    plt.xlabel("Upsample rate")
    plt.ylabel("Recall")
    plt.show()
def run_ours(Xtr, Ytr, Xt, Yt, lb, nsample, lambda_mode, q, sample_mode,
             k=None, rerun=True, eps=0.01, min_recall_per_class=0.8, log=None):
    """Train a ``DecisionSet`` on the training split, log it, predict the test split.

    The metric names are prefixed with ``ours0`` or ``ours1`` depending on
    ``rerun``. When ``k`` is omitted, 100 is used as the rule budget. When
    ``log`` is omitted, ``log`` is imported from the ``logger`` module.

    Returns:
        The predictions produced by ``dec.predict_all`` for ``Xt``.
    """
    name = 'ours{}'.format(int(rerun))
    if k is None:
        k = 100

    dec = DecisionSet(eps)
    dec.train(Xtr, Ytr,
              max_k=k,
              nsamp=nsample,
              lamb=lambda_mode,
              q=q,
              mode=sample_mode,
              rerun=rerun,
              min_recall_per_class=min_recall_per_class)
    print('default:', dec.default)

    test_transactions = [Transaction(feat2item(row)) for row in Xt.values]
    Y_pred = dec.predict_all(test_transactions)

    if log is None:
        from logger import log

    # Model shape and hyper-parameters.
    log('{}-default'.format(name), dec.default)
    log('{}-k'.format(name), len(dec.rules))
    log('{}-maxk'.format(name), k)
    for position, rule in enumerate(dec.rules):
        log('{}-nconds'.format(name), len(rule), position)
    log('{}-q'.format(name), q)
    log('{}-nsample'.format(name), nsample)
    log('{}-lamb'.format(name), lambda_mode)
    log('{}-seq'.format(name), dec.seq)

    # Test-set quality and rule-set diversity.
    auc = roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred))
    log('{}-auc'.format(name), auc)
    log('{}-bacc'.format(name), balanced_accuracy_score(Yt, Y_pred))
    log('{}-disp'.format(name), dispersion(dec.rules, average=True))
    log('{}-overlap'.format(name), overlap(dec.rules))
    log('{}-mode'.format(name), sample_mode)

    # Per-label precision and recall as reported by the helpers.
    for label, value in precision(dec).items():
        log('{}-precisions-tr'.format(name), value, label)
    for label, value in recall(dec.rules).items():
        log('{}-recall-tr'.format(name), value, label)

    print(confusion_matrix(Yt, Y_pred))
    return Y_pred
def set_default(self, label=None):
    '''Set and return the default label.

    An explicit ``label`` is stored as given. Otherwise the label from
    ``Itemset.labels`` with the smallest value in ``recall(self.rules)`` is
    chosen, i.e. the most under-represented class.
    '''
    if label is None:
        coverage = recall(self.rules)
        scores = [coverage[candidate] for candidate in Itemset.labels]
        label = Itemset.labels[np.argmin(scores)]
    self.default = label
    return label
def calculateLSTMaccuracy(receipts, results):
    """Print precision, recall and F1 of the LSTM predictions per field.

    ``results[i]`` holds the raw predictions for ``receipts[i]``. Predicted
    total price, currency, date and tax rate are cleaned before they are
    compared with ``receipt.groundTruth`` through the ``compare`` helpers.
    Every predicted product gets ``product['amount'] = 1`` assigned in place.

    A receipt counts as fully correct when no present ground-truth field was
    mismatched and every ground-truth product was matched.

    The macro average is the mean over the seven fields only; the micro
    average is reported separately and is not part of it.
    """
    report_order = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )
    n_total = dict.fromkeys((field for _, field in report_order), 0)
    n_found = dict.fromkeys(n_total, 0)
    n_correct = dict.fromkeys(n_total, 0)

    def clean_price(raw):
        # Keep digits and dots only, then drop a dot that is a thousands
        # separator rather than a decimal point.
        price = raw.replace(',', '.')
        price = ''.join(ch for ch in price if util.isInt(ch) or ch == '.')
        if price.count('.') == 2:
            first_dot = price.index('.')
            price = price[:first_dot] + price[first_dot + 1:]
        elif price.count('.') == 1 and len(price.split('.')[-1]) > 2:
            price = price.replace('.', '')
        return price

    def clean_currency(raw):
        # Keep alphabetic characters only.
        return ''.join(ch for ch in raw if ch.isalpha())

    def clean_date(raw):
        # Of exactly two space-separated tokens keep the longer one
        # (the first one on a tie).
        parts = raw.split(' ')
        if len(parts) != 2:
            return raw
        return parts[1] if len(parts[1]) > len(parts[0]) else parts[0]

    def clean_tax(raw):
        # Of exactly two space-separated tokens keep the first one.
        parts = raw.split(' ')
        return parts[0] if len(parts) == 2 else raw

    def report(title, total, found, correct):
        # Print one result section and return its (precision, recall).
        print(title)
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    # Evaluation order of the scalar fields: (field, cleaner, comparer).
    scalar_fields = (
        ('total_price', clean_price, compare.totalPrice),
        ('currency', clean_currency, compare.currency),
        ('date', clean_date, compare.date),
        ('vendor', None, compare.vendor),
        ('tax_rate', clean_tax, compare.taxRate),
        ('address', None, compare.address),
    )

    count = 0
    for i, receipt in enumerate(receipts):
        prediction = results[i]
        truth = receipt.groundTruth
        corr = True

        for field, cleaner, matches in scalar_fields:
            if field in prediction:
                value = prediction[field]
                if cleaner is not None:
                    value = cleaner(value)
            else:
                value = None

            if value:
                n_found[field] += 1
            if field in truth:
                n_total[field] += 1
                if matches(truth[field], value):
                    n_correct[field] += 1
                else:
                    corr = False

        real_products = truth['products'] if 'products' in truth else []
        n_total['products'] += len(real_products)

        matched = set()
        predicted_products = prediction['products'] if 'products' in prediction else []
        for product in predicted_products:
            # Every predicted product is given an amount of 1 (mutates the input).
            product['amount'] = 1
            n_found['products'] += 1
            if 'name' not in product:
                continue
            for j, real_product in enumerate(real_products):
                # Each ground-truth product may be matched only once.
                if j in matched:
                    continue
                if compare.products(product, real_product):
                    matched.add(j)
                    n_correct['products'] += 1
                    break

        # Previously `count` was never updated, so the summary always showed 0.
        if len(matched) < len(real_products):
            corr = False
        if corr:
            count += 1

    totalDataPoints = sum(n_total.values())
    totalDataPointsFound = sum(n_found.values())
    totalCorrect = sum(n_correct.values())

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, field in report_order:
        precision, recall = report(title, n_total[field], n_found[field], n_correct[field])
        total_precision += precision
        total_recall += recall

    # The micro average must not be added to the macro accumulators:
    # doing so put eight values into a sum that is divided by seven.
    report('-----MICRO AVG-----', totalDataPoints, totalDataPointsFound, totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(report_order))
    recall = total_recall / float(len(report_order))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
def calculateRuleBasedAccuracy(receipts):
    """Print precision, recall and F1 of the rule-based predictions per field.

    For each receipt, ``receipt.ruleBasedPrediction`` is compared with
    ``receipt.groundTruth`` through the ``compare`` helpers. A receipt counts
    as fully correct when no present ground-truth field was mismatched and
    every ground-truth product was matched.

    The macro average is the mean over the seven fields only; the micro
    average is reported separately and is not part of it.
    """
    report_order = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )
    n_total = dict.fromkeys((field for _, field in report_order), 0)
    n_found = dict.fromkeys(n_total, 0)
    n_correct = dict.fromkeys(n_total, 0)

    def report(title, total, found, correct):
        # Print one result section and return its (precision, recall).
        print(title)
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    # Evaluation order of the scalar fields: (field, comparer).
    scalar_fields = (
        ('total_price', compare.totalPrice),
        ('currency', compare.currency),
        ('date', compare.date),
        ('vendor', compare.vendor),
        ('tax_rate', compare.taxRate),
        ('address', compare.address),
    )

    count = 0
    for receipt in receipts:
        prediction = receipt.ruleBasedPrediction
        truth = receipt.groundTruth
        corr = True

        for field, matches in scalar_fields:
            value = prediction[field] if field in prediction else None
            if value:
                n_found[field] += 1
            if field in truth:
                n_total[field] += 1
                if matches(truth[field], value):
                    n_correct[field] += 1
                else:
                    corr = False

        real_products = truth['products'] if 'products' in truth else []
        n_total['products'] += len(real_products)

        matched = set()
        predicted_products = prediction['products'] if 'products' in prediction else []
        for product in predicted_products:
            n_found['products'] += 1
            for j, real_product in enumerate(real_products):
                # Each ground-truth product may be matched only once.
                if j in matched:
                    continue
                if compare.products(product, real_product):
                    matched.add(j)
                    n_correct['products'] += 1
                    break

        if len(matched) < len(real_products):
            corr = False
        if corr:
            count += 1

    totalDataPoints = sum(n_total.values())
    totalDataPointsFound = sum(n_found.values())
    totalCorrect = sum(n_correct.values())

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, field in report_order:
        precision, recall = report(title, n_total[field], n_found[field], n_correct[field])
        total_precision += precision
        total_recall += recall

    # The micro average must not be added to the macro accumulators:
    # doing so put eight values into a sum that is divided by seven.
    report('-----MICRO AVG-----', totalDataPoints, totalDataPointsFound, totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(report_order))
    recall = total_recall / float(len(report_order))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
def calculateMetrics(reciepts, result, writeToFile=False, path=None):
    """Print precision, recall and F1 of ``result`` against the ground truth.

    ``result[i]`` is the prediction for ``reciepts[i]`` and must contain the
    keys 'vendor', 'date', 'address', 'tax_rate', 'total_price', 'currency'
    and 'products'. Text fields are compared case-insensitively and must
    match exactly (edit distance 0). A predicted product matches a
    ground-truth product when name, price (``util.floatCompare``) and amount
    agree; each ground-truth product can be matched once.

    A receipt counts as fully correct when no present ground-truth field was
    mismatched and every ground-truth product was matched.

    When ``writeToFile`` is true, the prediction of every receipt is dumped
    as JSON to ``os.path.join(path, reciept.path)``.
    """
    report_order = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )
    n_total = dict.fromkeys((field for _, field in report_order), 0)
    n_found = dict.fromkeys(n_total, 0)
    n_correct = dict.fromkeys(n_total, 0)

    def parse_price(product):
        # Predicted product price as float, or None when absent or unparsable.
        if 'price' not in product:
            return None
        try:
            return float(product['price'].replace(',', '.'))
        except (TypeError, ValueError):
            return None

    def report(title, total, found, correct):
        # Print one result section and return its (precision, recall).
        print(title)
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    count = 0
    for idx, reciept in enumerate(reciepts):
        prediction = result[idx]
        truth = reciept.groundTruth
        # Snapshot of the raw predictions, written out when requested.
        result_dict = {}
        corr = True

        # Vendor and address: exact match after lower-casing.
        for field in ('vendor', 'address'):
            if field == 'address':
                # Dates sit between vendor and address in the original
                # evaluation order; keep that order.
                date = prediction['date']
                result_dict['date'] = date
                if date:
                    n_found['date'] += 1
                    date = date.lower()
                if 'date' in truth:
                    n_total['date'] += 1
                    real_date = truth['date'].lower()
                    # Also accept the ground-truth date with spaces removed.
                    if date == real_date or date == real_date.replace(' ', ''):
                        n_correct['date'] += 1
                    else:
                        corr = False

            text = prediction[field]
            result_dict[field] = text
            if text:
                n_found[field] += 1
                text = text.lower()
            if field in truth:
                n_total[field] += 1
                if text and levenshtein_distance(text, truth[field].lower()) <= 0:
                    n_correct[field] += 1
                else:
                    corr = False

        tax = prediction['tax_rate']
        result_dict['tax_rate'] = tax
        if tax is not None:
            n_found['tax_rate'] += 1
        if 'tax_rate' in truth:
            n_total['tax_rate'] += 1
            real_tax = int(float(truth['tax_rate'].lower().replace('%', '')))
            if tax == real_tax:
                n_correct['tax_rate'] += 1
            else:
                corr = False

        price = prediction['total_price']
        result_dict['total_price'] = price
        if price:
            n_found['total_price'] += 1
        if 'total_price' in truth:
            n_total['total_price'] += 1
            real_price = float(truth['total_price'].lower())
            if price == real_price:
                n_correct['total_price'] += 1
            else:
                corr = False

        currency = prediction['currency']
        result_dict['currency'] = currency
        if currency:
            n_found['currency'] += 1
            currency = currency.lower()
        if 'currency' in truth:
            n_total['currency'] += 1
            if currency == truth['currency'].lower():
                n_correct['currency'] += 1
            else:
                corr = False

        productsList = prediction['products']
        result_dict['products'] = productsList
        # NOTE(review): receipts without ground-truth products are treated as
        # having none; the original layout of this guard was lost -- confirm.
        real_products = truth['products'] if 'products' in truth else []
        n_total['products'] += len(real_products)

        matched = set()
        for product in productsList:
            n_found['products'] += 1
            # `j` rather than `i`: the loop must not clobber the receipt index.
            for j, real_product in enumerate(real_products):
                if j in matched:
                    continue
                product_price = parse_price(product)
                real_product_price = float(real_product['price'])
                same_name = levenshtein_distance(
                    product['name'].lower(), real_product['name'].lower()) <= 0
                if (same_name
                        and util.floatCompare(product_price, real_product_price)
                        and product['amount'] == real_product['amount']):
                    n_correct['products'] += 1
                    matched.add(j)
                    break

        # Was `corr |= False`, which can never clear the flag.
        if len(matched) < len(real_products):
            corr = False
        if corr:
            count += 1

        if writeToFile:
            with open(os.path.join(path, reciept.path), 'w') as fp:
                json.dump(result_dict, fp, indent=1)

    totalDataPoints = sum(n_total.values())
    totalDataPointsFound = sum(n_found.values())
    totalCorrect = sum(n_correct.values())

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(reciepts))

    total_precision = 0
    total_recall = 0
    for title, field in report_order:
        precision, recall = report(title, n_total[field], n_found[field], n_correct[field])
        total_precision += precision
        total_recall += recall

    # The micro average is reported on its own and is not part of the macro sum.
    report('-----MICRO AVG-----', totalDataPoints, totalDataPointsFound, totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / 7.0
    recall = total_recall / 7.0
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
def train(self, X, Y, max_k=100, nsamp=100, lamb=None, q='kl', mode=3, rerun=True, min_recall_per_class=0.5):
    """Build the rule set by repeated sampling and greedy selection.

    The item database is cleared and rebuilt from ``X`` and ``Y``. Candidate
    rules are then sampled round by round; after each round one rule is
    picked with ``greed.greedy_once()`` and either added or, when
    ``self.enough`` reports it as sufficient, its label is dropped from
    further sampling. The loop ends when ``len(self)`` reaches ``max_k``,
    no label is left to sample, or a round yields no candidates.

    Args:
        X, Y: training data, passed to ``prep_db``.
        max_k: upper bound on ``len(self)`` while sampling.
        nsamp: sample size requested per round.
        lamb: a number that is used as given, or 'max' / 'mean' to derive
            it from the quality of an initial sample; any other string or
            ``None`` yields 0.
        q: metric name passed to ``Rule.quality``.
        mode: sampling mode; 0 samples through ``sample_rn``, any other
            value through ``self.sample_from_each_label``.
        rerun: when true, a full greedy pass over all sampled candidates
            replaces the rule set if it scores higher under ``obj``.
        min_recall_per_class: not referenced in this method body.
    """
    print('##### START #####')
    Itemset.clear_db()
    prep_db(X, Y)

    # A numeric lamb supplied by the caller is used unchanged; only a string
    # or None is resolved here.
    if type(lamb) == str or lamb is None:
        # NOTE(review): the sample size here is the literal 100, not nsamp.
        samp = self.sample_from_each_label(set(Itemset.labels), 100, set(), mode)
        if lamb == 'max':
            lamb = np.max([Rule.quality([r], metric=q) for r in samp])
        elif lamb == 'mean':
            lamb = np.mean([Rule.quality([r], metric=q) for r in samp])
        else:
            lamb = 0
    print('lamb:', lamb)

    greed = GreedyDiv([], lamb)
    # Every candidate seen in any round; used by the rerun step below.
    U_all = []
    # Labels that are still being sampled.
    labels_samp = set(Itemset.labels)

    while len(self) < max_k and len(labels_samp) > 0:
        if mode == 0:
            samps = []
            for label in labels_samp:
                _, samp = sample_rn(nsamp, label)
                samp = [Rule(s, label) for s in list(samp)]  # Very time-consuming
                samps.extend(samp)
            U = set(samps)
        else:
            # Transactions of the current rules are passed along as `covered`.
            covered = set([t for r in self.rules for t in r.trans()])
            U = self.sample_from_each_label(labels_samp, nsamp, covered, mode)
        print('nsamp (after):', len(U))
        # Termination: nothing left to sample.
        if len(U) == 0:
            break
        U_all.extend(U)

        # One greedy step over the updated candidate universe.
        greed.update_univ(U)
        r = greed.greedy_once()

        # Termination criteria. Also check zero sampling above.
        if self.enough(r):
            # Include at least one rule per class, except default class.
            labels_samp.remove(r.label)
            print('remove label:', r.label)
        else:
            # Print quality vs. dispersion gained by adding r.
            # Note: this rebinds the `q` parameter to a number.
            q, d = obj(self.rules, lamb, sep=True)
            qr, dr = obj(self.rules + [r], lamb, sep=True)
            print('inc q vs. d: {}, {}'.format(qr - q, dr - d))
            self.add(r)
            # Stop sampling a label once its recall has reached 1.
            if np.abs(recall(self.rules)[r.label] - 1.0) < 1e-8:
                labels_samp.remove(r.label)
            print('#{} '.format(len(self.rules)), end='')
            printRules([r])

    # Consecutive greedy over all samples; keep it only if it scores higher.
    if rerun:
        greed.clear()
        greed.update_univ(list(set(U_all)))
        rules = greed.greedy(len(self.rules))
        if obj(rules, lamb) > obj(self.rules, lamb):
            print('Full greedy wins: {} > {}'.format(
                obj(rules, lamb), obj(self.rules, lamb)))
            self.reset(rules)

    default = self.set_default()
    print('default:', default)
    self.build()
    print('precision: ', precision(self).items())
    print('recall (coverage): ', recall(self.rules).items())
    print('ave disp: ', dispersion(self.rules, average=True))
    print('##### END #####')
def enough(self, r: Rule) -> bool:
    """Return True when adding ``r`` raises the recall of its label by at most ``self.recall_eps``."""
    recall_before = recall(self.rules)
    recall_after = recall(self.rules + [r])
    gain = recall_after[r.label] - recall_before[r.label]
    return True if gain <= self.recall_eps else False