Example #1
def send_config(config_path,
                token_file,
                server_address="https://api.neuralet.io"):
    """
    Send a config to the server to start a new job.
    :param config_path: Path of the .json config file
    :param token_file: Path of the token file
    :param server_address: Server address
    :return: The job_id on success, otherwise None
    """
    config_json = json_reader(config_path)
    url = server_address + "/api/v1/model/train/"
    token = token_reader(token_file)
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {token}",
    }
    print(f"Waiting for {url} ...")
    r = requests.post(url, headers=headers, data=config_json)
    if r.status_code == 200:
        job_id = r.json()["job_id"]
        print(f"The job was successfully initiated, job_id={job_id}")
        return job_id
    else:
        print(f"ERROR! ({r.status_code})")
        return None
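
Example #1 also relies on requests and two helpers, json_reader and token_reader, that the snippet does not show. A minimal sketch of what they might look like, assuming json_reader returns the config file as a JSON string and token_reader returns the bearer token (names and behavior are assumptions, not the original implementation):

import json


def json_reader(file_path):
    # Assumed helper: read a .json config file and return it as a JSON string,
    # which requests.post() can send directly as the request body.
    with open(file_path, "r") as f:
        return json.dumps(json.load(f))


def token_reader(file_path):
    # Assumed helper: read the API token from a plain-text file,
    # stripping surrounding whitespace.
    with open(file_path, "r") as f:
        return f.read().strip()
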
Example #2
def read_train(filename, stem=False, bigram=False):
    global doc_data, cnt, size_voc
    count = [0] * 5
    cnt = 0
    for i in range(5):
        class_dict[i][0] = Counter()
    for doc in utils.json_reader(filename):
        #txt = word_tokenize(doc["text"].lower())
        txt = re.findall(r"[\w']+", doc["text"].lower())
        #txt = doc["text"].split(" ")
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
            txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        vocab.update(txt)
        cnt = cnt + 1
        class_dict[int(doc["stars"]) - 1][0].update(txt)
        class_dict[int(doc["stars"]) - 1][2] += 1
        doc_data.append([doc["stars"], Counter(txt)])

    for i in range(5):
        class_dict[i][1] = sum(class_dict[i][0].values())
        print(class_dict[i][1], class_dict[i][2])
    print("vocab")
    print(len(vocab))
    size_voc = len(vocab)
Example #3
def read_t(filename, stem=False, bigram=False):
    global doc_data
    for doc in utils.json_reader(filename):
        #txt = word_tokenize(doc["text"].lower())
        txt = re.findall(r"[\w']+", doc["text"].lower())
        #txt = doc["text"].split(" ")
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
            txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        doc_data.append([doc["stars"], Counter(txt)])
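Examples #2 and #3 iterate over utils.json_reader(filename), so here the helper is presumably a generator that yields one parsed review dict per line of a JSON Lines file. A minimal sketch under that assumption (not the original utils implementation):

import json


def json_reader(filename):
    # Assumed helper: yield one parsed JSON object per line of a JSON Lines
    # file, so large review dumps can be streamed instead of loaded at once.
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)
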
Example #4
def set_model(train_file, model_file, preprocess_type='None', feature='None'):

    if os.path.exists(model_file):
        return

    docs = utility.json_reader(train_file)
    stars = np.zeros(5)
    category_count = np.zeros(5)

    class_frequency = {}
    count = 0
    all_words = []

    for doc in docs:
        count = count + 1
        if (count % 1000 == 0):
            print(count)
        words = text_processing(doc['text'], preprocess_type)
        words = feature_engg(words, feature)

        star = int(doc['stars'])
        stars[star - 1] += 1
        category_count[star - 1] += len(words)

        for word in words:
            if word not in class_frequency:
                all_words.append(word)
                class_frequency[word] = np.ones(5)
            class_frequency[word][star - 1] += 1

    #print(class_frequency)
    #print(all_words)

    m = count
    vocab_size = len(all_words)
    category_count += vocab_size

    for i in all_words:
        class_frequency[i] = np.log(class_frequency[i] / category_count)

    phai_y = np.log(stars / m)

    parameters = [class_frequency, phai_y, category_count]

    # open the file for writing and serialize the model parameters
    with open(model_file, 'wb') as obj_writer:
        pickle.dump(parameters, obj_writer)
    print('done')
Example #5
def data_loader(filename):
    start = time.time()
    X, Y = [], []
    data_gen = json_reader(filename)
    for i, sample in enumerate(data_gen):
        review = sample['text']
        # review = nltk.word_tokenize(review)
        stars = sample['stars']
        X.append(review)
        Y.append(stars)
        # if len(Y) == 5000:
        #     break
    df = pd.DataFrame({'text': X, 'stars': Y})
    print('Time taken = {}'.format(time.time() - start))
    return df
Example #6
def post_reports_to_slack():
    global value
    message = ""
    url = "https://hooks.slack.com/services/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"

    test_report_file = os.path.abspath(
        os.path.join(os.path.dirname(__file__),
                     '.report.json'))  # Add report file name and address here

    summary_json = utils.json_reader(test_report_file)
    # print(summary_json)
    summary = str(summary_json["summary"])
    print(summary)

    if 'failed' in summary:
        bar_color = "#ff0000"
    else:
        bar_color = "#36a64f"

    try:
        slack_message = {
            'blocks': [{
                'type': 'section',
                'text': {
                    'type': 'mrkdwn',
                    'text': ':bomb:* Test Automation Result:*'
                }
            }],
            "attachments": [{
                "color": bar_color,
                "title": "Test Report",
                "text": summary
            }]
        }

        json_params_encoded = json.dumps(slack_message)
        requests.post(url=url,
                      data=json_params_encoded,
                      headers={"Content-type": "application/json"})

    except Exception as e:
        print(e)
Example #7
def get_prediction(test_file,
                   model_file,
                   mode='None',
                   preprocess_type='None',
                   feature='None'):
    parameters = get_the_model(model_file)

    count = 0
    prob_dict = parameters[0]
    phai_y = parameters[1]
    category_count = parameters[2]
    print(len(prob_dict))

    docs = utility.json_reader(test_file)
    prediction = []
    original = []

    for doc in docs:
        if (count % 100000 == 0):
            print("iter:", count)
        count += 1

        if mode == 'b1':
            prediction.append(randint(1, 5))
        elif mode == 'b2':
            prediction.append(np.argmax(category_count) + 1)
        elif mode == 'a':
            words = text_processing(doc['text'], preprocess_type)
            words = feature_engg(words, feature)

            sum_of_logs = phai_y
            for word in words:
                if word not in prob_dict:
                    sum_of_logs = np.add(sum_of_logs,
                                         np.log(1 / category_count))
                else:
                    sum_of_logs = np.add(sum_of_logs, prob_dict[word])

            prediction.append(np.argmax(sum_of_logs) + 1)
        original.append(int(doc['stars']))

    return prediction, original
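
One way Example #4 and Example #7 could be wired together; the file names and the accuracy computation below are illustrative, not taken from the source:

# Hypothetical file names; train once (set_model is a no-op if the pickle
# already exists), then score the held-out reviews with the Naive Bayes mode.
set_model('train.json', 'nb_model.pkl', preprocess_type='None', feature='None')
prediction, original = get_prediction('test.json', 'nb_model.pkl', mode='a')

correct = sum(1 for p, o in zip(prediction, original) if p == o)
print('Accuracy: {:.2f}%'.format(100.0 * correct / len(original)))
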
Example #8
def main(train, test):	
	# test = "Stopped here today to give it a try and must admit the food was excellent"
	# bigram = nltk.bigrams(test.split())
	# print(list(map(''.join, bigram)))

	#Making Vocabulary for different labels out of training data
	vocab_list = [{}, {}, {}, {}, {}]
	vocabulary = {}
	vocab_list_bigrams = [{}, {}, {}, {}, {}]
	vocabulary_bigrams = {}
	#Count of each label in training data
	label_count = np.zeros(5)
	label_word_count = np.zeros(5)
	label_bigram_count = np.zeros(5)

	start1 = time.time()
##############################################################################
	#Training part
	iter = (ut.json_reader(train))
	# for i in range(TRAINFULLSIZE):
	i1=0
	for element in iter:
		i1+=1
		if (i1%1000)==0:
			print("Training: ", i1/1000)
		# for i in range(1):
		# element = next(iter)
		label_count[int(element["stars"])-1]+=1
		# print((remove_duplicates((element["text"]).split())))
		# label_word_count[int(element["stars"])-1]+= len((element["text"]).split())
		# Switch these lines for stemming
		stemmed = (element["text"].split())
		# stemmed = ut.getStemmedDocuments(element["text"]) 
		# bigram = nltk.bigrams(stemmed)
		# bigramlist = list(map(''.join, bigram))
		
		label_word_count[int(element["stars"])-1]+= len(stemmed)
		# label_bigram_count[int(element["stars"])-1]+= len(bigramlist)
		
		# stemmed.extend(bigramlist)
		# print(stemmed)
		for x in (stemmed):
		# for x in ((element["text"]).split()):
			word = x.strip(string.punctuation)
			# word = x
			# print(word)
			if word=="":
				continue
			if word in vocab_list[int(element["stars"])-1]:
				(vocab_list[int(element["stars"])-1])[word]+=1
			else:
				(vocab_list[int(element["stars"])-1])[word]=1
	
			vocabulary[word]=1

		# for x in (bigramlist):
		# # for x in ((element["text"]).split()):
		# 	word = x.strip(string.punctuation)
		# 	# word = x
		# 	# print(word)
		# 	if word=="":
		# 		continue
		# 	if word in vocab_list_bigrams[int(element["stars"]-1)]: 
		# 		(vocab_list_bigrams[int(element["stars"])-1])[word]+=1
		# 	else:
		# 		(vocab_list_bigrams[int(element["stars"])-1])[word]=1
	
		# 	vocabulary_bigrams[word]=1

##############################################################################

	end1 = time.time()
	print("Training done, Time taken(mins)", int(end1-start1)/60)

	# print(len(vocab))
	# count=0;
	# for i in range(5):
	# 	print(label_count[i])
	# 	count+=(label_count[i])
	# print(count)
	prior = label_count/TRAINSIZE
	# print(prior)
	
	actual_value = []
	predicted_value = []
	random_prediction = []
	start2 = time.time()
##############################################################################
	#TESTING
	i2=0
	iter2 = (ut.json_reader(test))
	for test_element in iter2:
		i2+=1
		if (i2%1000)==0:
			print("Testing: ", i2/1000)
		# print(i)
		#Random baseline prediction between 1 and 5 (randint is inclusive on both ends)
		random_prediction.append(random.randint(1,5))
		# test_element = next(iter2)
		actual_value.append(int(test_element["stars"]))
		# test = "Stopped here today to give it a try and must admit the food was excellent. I ordered the vegetarian Soyrizo (fake sausage) burrito and fell in love. It was well worth the $6. It's not like the big chain restaurants where they serve you a massive sloppy burrito. It was the perfect size and easily handled. \nIt's small and quaint, with some seating outside in under a canopy. The owners were a lovely couple, passionate about their food. \nExcellent."
		# test = "Fast, easy, helpful. In and out quickly and got the medicine I needed. Smart staff who was kind and helpful. Clean facility. No complaints from me"	
		# test = "Service good, we had hummas, gyros, spiced date crumble.... all real good... need to try the flamming cheese next time!...  messed up on a few tables bill.. including ours but got it fixed.  I liked it. . .  my guest was on the fence."
		test = test_element["text"]
		test_list = ((test).split())
		# test_list = (ut.getStemmedDocuments(test_element["text"]))
		# bigram = nltk.bigrams(test_list)
		# bigramlist = list(map(''.join, bigram))
		# test_list.extend(bigramlist)
		# print(test_list)
		results = []
		for i in range(5):
		#check for 1 rating
		# i=0
			py = prior[i]
			logr = 0
			rating=i+1
			for x in test_list:
				word = x.strip(string.punctuation)
				# word = x
				# print(word)
				if word == "":
					continue
				if word in vocab_list[i]:
					# print(word)
					# print(((vocab_list[i])[word]))
					# print(label_count[i])
					probability = (((vocab_list[i])[word])+1)/(label_word_count[i]+len(vocabulary))
					logr+=math.log(probability)
				else:
					# print("not")
					logr+=math.log(1/(label_word_count[i]+len(vocabulary)))

			# for x in bigramlist:
			# 	word = x.strip(string.punctuation)
			# 	# word = x
			# 	# print(word)
			# 	if word == "":
			# 		continue
			# 	if word in vocab_list_bigrams[i]:
			# 		# print(word)
			# 		# print(((vocab_list[i])[word]))
			# 		# print(label_count[i])
			# 		probability = (((vocab_list_bigrams[i])[word])+1)/(label_bigram_count[i]+len(vocabulary_bigrams))
			# 		logr+=math.log(probability)
			# 	else:
			# 		# print("not")
			# 		logr+=math.log(1/(label_bigram_count[i]+len(vocabulary_bigrams)))
			results.append(logr+(math.log(py)))
			# print("------------------------------------------")
		
		predicted_value.append(results.index(max(results))+1)
		# print(results.index(max(results))+1)
##############################################################################

	# print(len(predicted_value))

	major = list(label_count).index(max(label_count))+1
	correct=0
	correct_random=0
	correct_major=0
	# confusion =  np.zeros((5,5))
	# calc_f1_score = np.zeros(5)

	for i in range(len(predicted_value)):
		# print(predicted_value[i])
		if(predicted_value[i]==actual_value[i]):
			correct+=1
		if(random_prediction[i]==actual_value[i]):
			correct_random+=1
		if(major==actual_value[i]):
			correct_major+=1
		# confusion[predicted_value[i]-1][actual_value[i]-1]+=1
	
	# row_sum = np.sum(confusion, axis=1)
	# column_sum = np.sum(confusion, axis=0)
	# for i in range(5):
	# 	precision = confusion[i][i]/row_sum[i]
	# 	recall = confusion[i][i]/column_sum[i]
	# 	calc_f1_score[i] = 2*((precision*recall)/(precision+recall))
	
	end2 = time.time()
	print("Testing done, Time taken(mins)", int(end2-start2)/60)


	# print("Correct")
	# print(correct)
	# print(len(actual_value))
	print("Accuracy using Naive Bayes: ", int(correct/len(actual_value)*100) , "%")
	print("Accuracy using Random prediction: ", int(correct_random/len(actual_value)*100) , "%")
	print("Accuracy using Majority prediction: ", int(correct_major/len(actual_value)*100) , "%")
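
Example #8 scores each class with add-one (Laplace) smoothing: a seen word contributes log((count+1)/(label_word_count[i]+len(vocabulary))), and an unseen word contributes log(1/(label_word_count[i]+len(vocabulary))). A standalone illustration of that estimate with made-up counts:

import math

# Hypothetical counts illustrating the add-one smoothed estimate used above:
# P(word | star) = (count(word, star) + 1) / (words_in_star + |V|).
word_count_in_class = 3       # occurrences of a word in all 5-star reviews
words_in_class = 1000         # corresponds to label_word_count[i]
vocab_size = 250              # corresponds to len(vocabulary)

log_p = math.log((word_count_in_class + 1) / (words_in_class + vocab_size))
print(log_p)  # log-probability this word contributes to the 5-star class
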
Example #9
def main():
    # TODO: add logging

    # TODO: check the existence of input file(s)
    # TODO: read multiple csv files or call this script multiple times with given path to csv
    csv_path_taxi_trips = sys.argv[1]

    dir_root = os.path.dirname(os.path.realpath(__file__))

    # config json
    config_json_path = os.path.join(dir_root, "config.json")
    config_json = json_reader(config_json_path)

    if not config_json:
        print("no config json found or empty {}".format(config_json_path))
        return 1
    else:
        if 'db' not in config_json:
            print("db config not found or empty")
            return 1

    # database settings from config
    db_settings = config_json['db']
    db_hostname = db_settings['hostname']
    db_name = db_settings['db_name']
    db_schema = db_settings['schema']
    db_username = db_settings['username']

    # TODO: better to store DDLs separately and run before this job
    ddl_taxi_trips = """
            CREATE TABLE IF NOT EXISTS {0}.taxi_trips (    
                vendor_id               INTEGER,
                lpep_pickup_datetime    TIMESTAMP,
                lpep_dropoff_datetime   TIMESTAMP,
                store_and_fwd_flag      VARCHAR(1),
                ratecode_id             INTEGER,
                pulocation_id           INTEGER,
                dolocation_id           INTEGER,
                passenger_count         INTEGER,
                trip_distance           DECIMAL,
                fare_amount             DECIMAL,
                extra                   DECIMAL,
                mta_tax                 DECIMAL,
                tip_amount              DECIMAL,
                tolls_amount            DECIMAL,
                ehail_fee               DECIMAL,
                improvement_surcharge   DECIMAL,
                total_amount            DECIMAL,
                payment_type            INTEGER,
                trip_type               INTEGER,
                congestion_surcharge    DECIMAL,
                taxi_type               VARCHAR(50)
            );
        """.format(db_schema)

    try:
        conn = get_connection(hostname=db_hostname,
                              db=db_name,
                              username=db_username)
    except:
        print("Error in creating a connection.")
        return 1

    # for dev env
    conn.autocommit = True

    with conn.cursor() as cursor:
        # Create a taxi_trips table if not exists
        try:
            execute_ddl(cursor, ddl_taxi_trips)
        except:
            print("Error in creating taxi_trips table")
            # return 1

        # Insert the rows from taxi trips csv by reading using generator
        # TODO: insert only new rows (taxi_type, YYYY-MM)
        try:
            insert_taxi_trips(conn,
                              iter(
                                  rows_from_a_csv_file(csv_path_taxi_trips,
                                                       skip_first_line=True)),
                              page_size=1000)
        except:
            print("Error in inserting the data into taxi_trips table")
            return 1

    return 0
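
Examples #9 and #13 expect json_reader to hand back the parsed config.json as a dict, and something falsy when the file is absent or empty (get_connection, execute_ddl and the insert helpers are likewise defined elsewhere). A minimal sketch under those assumptions:

import json
import os


def json_reader(path):
    # Assumed helper: return the parsed config as a dict, or an empty dict if
    # the file is missing or empty, so the caller's "if not config_json" works.
    if not os.path.isfile(path) or os.path.getsize(path) == 0:
        return {}
    with open(path, "r") as f:
        return json.load(f)
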
Example #10
import sys
from datetime import datetime
import os.path
import ipdb
sys.path.append('../src/')

from web3 import Web3, HTTPProvider, TestRPCProvider, KeepAliveRPCProvider
from solc import compile_source
from web3.contract import ConciseContract
import etherscan.accounts as accounts


from utils import json_reader, parallel_dict_update, write_pickle, read_pickle

api_key = '3JS9BXYFNNGNX17WKANJMU63R6BQKJW5WE'
config = json_reader('config.json')
LR_ABI = config['LR_ABI']
LEDGER_ABI = config['LEDGER_ABI']
LEDGER_ADDRESS = config['LEDGER_ADDRESS']
node = config['node']
DECIMALS = 1e18


def timestamp_converter(timeStamp):
    timeStamp = int(timeStamp)
    return datetime.fromtimestamp(timeStamp).strftime('%Y-%m-%d %H:%M:%S')


def get_contract_time(contract_adress, api_key):
    api = accounts.Account(address=contract_adress, api_key=api_key)
    time = api.get_transaction_page(page=1, offset=1, internal=True)[
Example #11
import pandas as pd
from utils.scraper import *
from utils.classifier import *
from utils.db_controller import *
from utils.json_reader import *

print('Process started... "Scraping profiles"')
scraper = Scraper()
print('Process ended... "Scraping profiles"')

print('Process started... "Reading JSON files"')
reader = json_reader()
posts, comments = reader.get_df()
print('Process ended... "Reading JSON FILES"')

print('Process started... "Classifying comments"')
classif = classifier(comments.copy())
comments = classif.get_df()
print('Process ended... "Classifying comments"')

print('Process started... "Inserting into DB"')
db = db_controller(posts, comments)
db.insert_into()
print('Process ended... "Inserting into DB"')

print('Cleaning the JSON files.')
reader.clean_files()
Example #12
from clean.master import MasterCleaner
from utils import json_reader, simple_reader, simple_writer

if __name__ == "__main__":
    config = json_reader('cleaner_config.json')

    cleaner = MasterCleaner(config)

    target_data = simple_reader('dataset/test_data.txt')
    corpus = list()
    for line in target_data:
        new_line = cleaner.cleaning(line)
        corpus.append(new_line)
    simple_writer('output/test_data_cleaned.txt', corpus)

    # faster approach
    """
    writer = codecs.open('output/test_data_cleaned.txt', 'w', encoding='utf-8')
    target_data = simple_reader('dataset/test_data.txt')
    corpus = list()
    for line in target_data:
        new_line = cleaner.cleaning(line)
        writer.write(new_line + '\n')
    writer.close()
    """
Example #13
def main():
    # TODO: check the existence of input file(s)
    csv_path_taxi_zone = sys.argv[1]

    dir_root = os.path.dirname(os.path.realpath(__file__))

    # config json
    config_json_path = os.path.join(dir_root, "config.json")
    config_json = json_reader(config_json_path)

    if not config_json:
        print("no config json found or empty {}".format(config_json_path))
        return 1
    else:
        if 'db' not in config_json:
            print("db config not found or empty")
            return 1

    # database settings from config
    db_settings = config_json['db']
    db_hostname = db_settings['hostname']
    db_name = db_settings['db_name']
    db_schema = db_settings['schema']
    db_username = db_settings['username']

    # TODO: better to store DDLs separately and run before this job
    ddl_taxi_zone = """
        CREATE TABLE IF NOT EXISTS {0}.taxi_zone (    
            location_id     INTEGER,
            borough         VARCHAR(255),
            zone            VARCHAR(255),
            service_zone    VARCHAR(255)
        );
    """.format(db_schema)

    # creating a connection to db
    try:
        conn = get_connection(hostname=db_hostname, db=db_name, username=db_username)
    except:
        print("Error in creating a connection.")
        return 1

    # for dev env
    conn.autocommit = True

    with conn.cursor() as cursor:
        # Create a taxi_zone table if not exists
        try:
            execute_ddl(cursor, ddl_taxi_zone)
        except:
            print("Error in creating taxi_zone table")
            # return 1

        # Insert into taxi_zone dictionary
        # TODO: insert only new rows
        try:
            insert_taxi_zone(
                conn,
                iter(rows_from_a_csv_file(csv_path_taxi_zone, skip_first_line=True)),
                page_size=1000
            )
        except:
            print("Error in inserting the data into taxi_zone table")
            return 1

    return 0