예제 #1
0
def clean_media_type(clean_table):
    """Rewrite each row's ``media_type`` in *clean_table* to a canonical label.

    Selects ``id`` and ``media_type`` for every row, lets
    ``string_matcher.get_matching_seq`` pick a key of the canonical label
    table (DVD / BLURAY / VHS) for the row's sequence, and writes the chosen
    label back through ``database_handler.update``.  Rows for which
    ``get_seq_from_row`` yields an empty sequence are left untouched.

    :param clean_table: name of the table to read and update.  It is
        interpolated into the SQL text as-is, so it must be a trusted
        identifier.
    """
    media_type = "media_type"
    query = "SELECT id, {0} FROM {1}".format(media_type, clean_table)
    rows = database_handler.select(query)

    # Candidate labels keyed by position; this is the mapping handed to the
    # matcher and indexed with the key it returns.
    gt_dict = dict(enumerate(["DVD", "BLURAY", "VHS"]))

    for row in rows:
        media_seq = get_seq_from_row(row, [media_type])
        if not media_seq:
            # Nothing to match against; keep the stored value.
            continue

        best_matching_index = string_matcher.get_matching_seq(media_seq, gt_dict)
        best_matching_type = gt_dict[best_matching_index]
        update_query = "UPDATE {0} SET {1}='{2}' WHERE id={3}".format(
            clean_table, media_type, best_matching_type, row["id"]
        )
        # Building the statement alone changes nothing; it has to be executed.
        database_handler.update(update_query)
예제 #2
0
def clean_media_type_parallel(gt_dict,rows,clean_table):
	"""Set ``media_type`` of every row in *rows* to its selected entry of *gt_dict*.

	For each row, ``string_matcher.get_matching_seq`` is called with the
	row's current ``media_type`` and *gt_dict*; the value it returns is used
	as a key into *gt_dict*, and the looked-up label is written to
	*clean_table* for that row's ``id`` via ``database_handler.update``.

	:param gt_dict: mapping of candidate labels, indexed by the matcher's result.
	:param rows: iterable of records exposing ``"id"`` and ``"media_type"``.
	:param clean_table: name of the table to update (inserted into the SQL verbatim).
	"""
	statement = "UPDATE {0} SET media_type='{1}' WHERE id={2}"
	for record in rows:
		chosen_key = string_matcher.get_matching_seq(record["media_type"], gt_dict)
		database_handler.update(
			statement.format(clean_table, gt_dict[chosen_key], record["id"])
		)
예제 #3
0
def clean_movie_in_parallel(gt_seq_dict,gt_rows,rows,clean_table,title_attr,genre_attr):
	"""Overwrite title and genre of each row with those of its selected ground-truth row.

	For every row, the concatenation of its ``movie_title`` and
	``movie_genre`` is passed to ``string_matcher.get_matching_seq`` together
	with *gt_seq_dict*.  The returned value indexes *gt_rows*, and that row's
	``title`` / ``genre`` are written to the *title_attr* / *genre_attr*
	columns of *clean_table* for the row's ``id``.

	:param gt_seq_dict: candidate sequences handed to the matcher; its keys
		are expected to be valid indexes into *gt_rows*.
	:param gt_rows: ground-truth records exposing ``"title"`` and ``"genre"``
		(assumed to be strings).
	:param rows: records exposing ``"id"``, ``"movie_title"`` and ``"movie_genre"``.
	:param clean_table: name of the table to update (inserted into the SQL verbatim).
	:param title_attr: column that receives the ground-truth title.
	:param genre_attr: column that receives the ground-truth genre.
	"""
	for row in rows:
		mv_seq = ''.join(row[col] for col in ["movie_title", "movie_genre"])
		best_matching_index = string_matcher.get_matching_seq(mv_seq, gt_seq_dict)
		best_row = gt_rows[best_matching_index]
		# Double embedded single quotes so values such as "Ocean's Eleven"
		# do not terminate the SQL string literal early.
		title_update = best_row["title"].replace("'", "''")
		genre_update = best_row["genre"].replace("'", "''")
		clean_update = "UPDATE {0} SET {1}='{2}', {3}='{4}' WHERE id={5}".format(clean_table, title_attr, title_update, genre_attr, genre_update, row["id"])
		database_handler.update(clean_update)
예제 #4
0
def clean_customer_in_parallel(gt_table,rows,clean_table):
	"""Replace customer fields of each row with those of its selected ground-truth row.

	For every row, the ground-truth records whose ``birth_date`` equals the
	row's ``customer_birthday`` are fetched from *gt_table*.  The row's
	concatenated first name, last name, street and gender are passed to
	``string_matcher.get_matching_seq`` along with the same concatenation for
	each candidate; the returned value indexes the candidate list.  The chosen
	candidate's fields are written to *clean_table* and ``customer_city`` is
	set to ``'NEW YORK'``.

	Rows without any ground-truth candidate are skipped and left unchanged.

	:param gt_table: name of the ground-truth table (inserted into the SQL verbatim).
	:param rows: records exposing ``"id"``, ``"customer_birthday"``,
		``"customer_firstname"``, ``"customer_lastname"``,
		``"customer_street"`` and ``"customer_gender"``.
	:param clean_table: name of the table to update (inserted into the SQL verbatim).
	"""
	gt_columns = ["firstname", "lastname", "street", "gender"]
	for row in rows:
		gt_query = "SELECT firstname, lastname, street, gender FROM {0} WHERE birth_date='{1}'".format(gt_table, row["customer_birthday"])
		gt_rows = database_handler.select(gt_query)
		if not gt_rows:
			# No candidate shares this birthday: there is nothing to index
			# into below, so leave the row as it is instead of aborting the batch.
			continue
		cus_seq = ''.join(row[col] for col in ["customer_firstname", "customer_lastname", "customer_street", "customer_gender"])
		# Candidate sequences keyed by their position in gt_rows.
		gt_seq_dict = {
			i: ''.join(gt_row[col] for col in gt_columns)
			for i, gt_row in enumerate(gt_rows)
		}
		best_matching_index = string_matcher.get_matching_seq(cus_seq, gt_seq_dict)
		best_matching_row = gt_rows[best_matching_index]
		# Double embedded single quotes so the values stay inside their SQL literals.
		firstname, lastname, street, gender = [
			best_matching_row[col].replace("'", "''") for col in gt_columns
		]
		update_query = "UPDATE {0} SET customer_firstname='{1}', customer_lastname='{2}', customer_street='{3}', customer_gender='{4}', customer_city='NEW YORK' WHERE id={5}".format(clean_table,firstname,lastname,street,gender,row["id"])
		database_handler.update(update_query)
예제 #5
0
def clean_customer_in_parallel(groundtruth,rows,customer):
	"""Replace name, street, city and gender of each row from its selected ground-truth row.

	For every row, the records of *groundtruth* whose ``birth_date`` equals
	the row's ``birthday`` are fetched.  The row's concatenated ``name``,
	``street`` and ``gender`` are passed to
	``string_matcher.get_matching_seq`` along with each candidate's
	concatenated first name, last name, street and gender; the returned value
	indexes the candidate list.  The chosen candidate's "firstname lastname",
	street and gender are written to *customer* and ``city`` is set to
	``NEW YORK``.

	Rows without any ground-truth candidate are skipped and left unchanged.

	:param groundtruth: name of the ground-truth table (inserted into the SQL verbatim).
	:param rows: records exposing ``"id"``, ``"birthday"``, ``"name"``,
		``"street"`` and ``"gender"``.
	:param customer: name of the table to update (inserted into the SQL verbatim).
	"""
	gt_columns = ["firstname", "lastname", "street", "gender"]
	for row in rows:
		gt_query = "SELECT firstname, lastname, street, gender FROM {0} WHERE birth_date='{1}'".format(groundtruth, row["birthday"])
		groundtruth_rows = database_handler.select(gt_query)
		if not groundtruth_rows:
			# No candidate shares this birthday: there is nothing to index
			# into below, so leave the row as it is instead of aborting the batch.
			continue
		# Sequences are joined inline rather than through a helper call.
		cus_seq = ''.join(row[col] for col in ["name", "street", "gender"])
		# Candidate sequences keyed by their position in groundtruth_rows.
		gt_seq_dict = {
			i: ''.join(gt_row[col] for col in gt_columns)
			for i, gt_row in enumerate(groundtruth_rows)
		}
		best_matching_index = string_matcher.get_matching_seq(cus_seq, gt_seq_dict)
		best_matching_row = groundtruth_rows[best_matching_index]
		# Double embedded single quotes so the values stay inside their SQL literals.
		name = ' '.join([best_matching_row["firstname"], best_matching_row["lastname"]]).replace("'", "''")
		street = best_matching_row["street"].replace("'", "''")
		gender = best_matching_row["gender"].replace("'", "''")
		update_query = "UPDATE {0} SET name='{1}', street='{2}', city='{3}',gender='{4}' WHERE id={5}".format(customer, name, street,"NEW YORK", gender, row["id"])
		database_handler.update(update_query)
예제 #6
0
def clean_movie_genre_in_parallel(gt_genres_dict,rows,clean_table):
	"""Set ``movie_genre`` of every row in *rows* to its selected entry of *gt_genres_dict*.

	For each row, ``string_matcher.get_matching_seq`` is called with the
	row's current ``movie_genre`` and *gt_genres_dict*; the value it returns
	is used as a key into *gt_genres_dict*, and the looked-up genre is
	written to *clean_table* for that row's ``id``.

	:param gt_genres_dict: mapping of candidate genre labels (assumed to be
		strings), indexed by the matcher's result.
	:param rows: records exposing ``"id"`` and ``"movie_genre"``.
	:param clean_table: name of the table to update (inserted into the SQL verbatim).
	"""
	for row in rows:
		best_matching_index = string_matcher.get_matching_seq(row["movie_genre"], gt_genres_dict)
		# Double embedded single quotes so labels such as "Children's"
		# do not terminate the SQL string literal early.
		best_matching_genre = gt_genres_dict[best_matching_index].replace("'", "''")
		database_handler.update("UPDATE {0} SET movie_genre='{1}' WHERE id={2}".format(clean_table,best_matching_genre,row["id"]))