# --- Script chunk: image info generation (collapsed one-liner, reformatted) ---
# NOTE(review): this appears to be the top of a standalone preprocessing script;
# it is truncated mid-function at the end of the visible span.
import feather
import glob
import sys
import time
import os
import gc
from multiprocessing import Pool
from PIL import Image
from collections import Counter
import libavito as a  # project-local helpers (console colors, config, CLI mode)

print(a.c.BOLD + 'Generating image info ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '2_image_info.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads   # worker count for multiprocessing.Pool
cache_loc = config.cache_loc               # directory for intermediate feather files
debug = config.debug
root = config.images_root                  # root directory holding the image files

# Function to compute difference hash of image
def DifferenceHash(img):
    # `img` is presumably a numpy-style array (fed to Image.fromarray) — TODO confirm at caller.
    theImage = Image.fromarray(img)
    # Convert the image to 8-bit grayscale.
    theImage = theImage.convert("L")  # 8-bit grayscale
    # Squeeze it down to an 8x8 image.
    # NOTE(review): function body continues beyond the visible span; truncated here.
# --- Script chunk: set3b title features (collapsed one-liner, reformatted) ---
# NOTE(review): the first statement below is the tail of a similarity function
# whose `def` line lies before this span (jaccard-style, judging by the names).
    return intersection_cardinality / float(union_cardinality)

def ratio_of_matches(x, y):
    # Fraction of elements of x that also occur in y; -1.0 flags an empty x
    # so callers can distinguish "no overlap" (0.0) from "nothing to match".
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    x_cardinality = len(x)
    if x_cardinality == 0:
        return -1.0
    else:
        return intersection_cardinality / float(x_cardinality)

print(a.c.BOLD + 'Extracting set3b title features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3b_title.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the columns this feature set needs, then free the full frame.
train = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2']]
del df
# --- Script chunk: set3c JSON features (collapsed one-liner, reformatted) ---
# NOTE(review): the first statement below is the tail of a similarity function
# whose `def` line lies before this span (jaccard-style, judging by the names).
    return intersection_cardinality / float(union_cardinality)

def ratio_of_matches(x, y):
    # Fraction of elements of x that also occur in y; -1.0 flags an empty x
    # so callers can distinguish "no overlap" (0.0) from "nothing to match".
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    x_cardinality = len(x)
    if x_cardinality == 0:
        return -1.0
    else:
        return intersection_cardinality / float(x_cardinality)

print(a.c.BOLD + 'Extracting set3c JSON features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3c_json.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the columns this feature set needs, then free the full frame.
train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
del df
# --- Script chunk: set4b fuzzy cleaned-text features (reformatted) ---
# NOTE(review): the statements below are the interior of a per-row feature
# function whose `def` (and the binding of `d`, `row`, `values`) lie before
# this span.
    s1 = str(row[d])
    s2 = str(row[d + 1])
    # Edit-distance style features on the column pair (d, d+1).
    values.append(jellyfish.levenshtein_distance(s1, s2))
    values.append(jellyfish.jaro_distance(s1, s2))
    #values.append(float(jellyfish.damerau_levenshtein_distance(s1,s2)) )
    # Fuzzywuzzy token/partial ratios, integer scores in the 0-100 range.
    values.append(fuzz.partial_ratio(s1, s2))
    values.append(fuzz.token_set_ratio(s1, s2))
    values.append(fuzz.ratio(s1, s2))
    values.append(fuzz.token_sort_ratio(s1, s2))
    return values

print(a.c.BOLD + 'Extracting set4b fuzzy cleaned text features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set4b_fuzzy_clean.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# NOTE(review): column-selection list continues beyond the visible span.
df = df[[
    'itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1',
# --- Script chunk: set2a Levenshtein/location features (reformatted) ---
import time
import gc
import random
import sys
import Levenshtein  # pip install python-Levenshtein
from haversine import haversine
import libavito as a
# NOTE(review): `feather` and `pd` are used below but their imports are not
# visible in this span — presumably imported elsewhere in the original file.

# Noise to add to variables to prevent overfitting, a value between +- the selected value will be added to every instance
tot_lon_noise = 0.25
tot_lat_noise = 0.25
loc_dist_noise = 10

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set2a_lev_loc.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Create dataframe for features
x_all = pd.DataFrame()
# Fixed seed so the injected noise is reproducible between runs.
random.seed(2016)
# Not black magic, iterate over title/description/json for d in [2, 4, 6]: st_1 = str(row[d]) st_2 = str(row[d + 1]) values.append(fuzz.partial_ratio(st_1, st_2)) values.append(fuzz.token_set_ratio(st_1, st_2)) values.append(fuzz.ratio(st_1, st_2)) values.append(fuzz.token_sort_ratio(st_1, st_2)) return values print(a.c.BOLD + 'Extracting set4a fuzzy text features ...' + a.c.END) # Get train/test mode from launch argument mode = a.get_mode(sys.argv, '3_feature_set4a_fuzzy.py') ## Read settings required by script config = a.read_config() nthreads = config.preprocessing_nthreads cache_loc = config.cache_loc debug = config.debug if mode == 0: root = config.train_images_root df = feather.read_dataframe(cache_loc + 'train.fthr') if mode == 1: root = config.test_images_root df = feather.read_dataframe(cache_loc + 'test.fthr') df = df[[ 'itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1',
# --- Script chunk: set3f image hamming features (reformatted) ---
import sys
import feather
import time
import gc
from multiprocessing import Pool
import libavito as a

def debug(s):
    # Print-and-pause helper; the 1s sleep presumably keeps interleaved
    # multiprocess output readable — verify intent with the author.
    print(str(s))
    time.sleep(1)

print(a.c.BOLD + 'Extracting set3f image hamming features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3f_hamming.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
# NOTE(review): left commented out — assigning config.debug here would shadow
# the debug() helper defined above.
#debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    df = feather.read_dataframe(cache_loc + 'test.fthr')

root = config.images_root
# Precomputed per-image hash database produced by an earlier pipeline step.
image_db = feather.read_dataframe(cache_loc + 'image_database.fthr')
df = df[['itemID_1', 'itemID_2', 'images_array_1', 'images_array_2']]
# --- Script chunk: set3d JSON features (collapsed one-liner, reformatted) ---
# NOTE(review): the first statement below is the tail of a similarity function
# whose `def` line lies before this span (jaccard-style, judging by the names).
    return intersection_cardinality / float(union_cardinality)

def ratio_of_matches(x, y):
    # Fraction of elements of x that also occur in y; -1.0 flags an empty x
    # so callers can distinguish "no overlap" (0.0) from "nothing to match".
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    x_cardinality = len(x)
    if x_cardinality == 0:
        return -1.0
    else:
        return intersection_cardinality / float(x_cardinality)

print(a.c.BOLD + 'Extracting set3d JSON features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3d_json1.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the columns this feature set needs, then free the full frame.
train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
del df
# --- Script chunk: set3a description features (collapsed one-liner, reformatted) ---
# NOTE(review): the first statement below is the tail of a similarity function
# whose `def` line lies before this span (jaccard-style, judging by the names).
    return intersection_cardinality / float(union_cardinality)

def ratio_of_matches(x, y):
    # Fraction of elements of x that also occur in y; -1.0 flags an empty x
    # so callers can distinguish "no overlap" (0.0) from "nothing to match".
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    x_cardinality = len(x)
    if x_cardinality == 0:
        return -1.0
    else:
        return intersection_cardinality / float(x_cardinality)

print(a.c.BOLD + 'Extracting set3a description features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3a_description.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the columns this feature set needs, then free the full frame.
train = df[['itemID_1', 'itemID_2', 'cleandesc_1', 'cleandesc_2']]
del df
# --- Script chunk: set2c histogram/hue features (reformatted) ---
# NOTE(review): the block below is the tail of an error-reporting helper whose
# `def` lies before this span; `where`, `e`, `x`, `write_info` are bound there.
    if where == 'ftr':
        print(str(e) + " FTR ERROR at " + str(x) + " !!!")
        if write_info is True:
            # Append the failure to a persistent error log for later inspection.
            p = open(cache_loc + 'hist_errors_extract.txt', 'a')
            p.write('FTR ERROR at ' + str(x) + "\n")
            p.close()

print(a.c.BOLD + 'Extracting set2c image histogram/hue features ...' + a.c.END)

# Suppress expected warnings from pandas
warnings.filterwarnings(
    "ignore",
    message='mean of empty slice|all-nan (axis|slice) encountered')

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set2c_hist.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Select columns required by script
df = df[['images_array_1', 'images_array_2', 'itemID_1', 'itemID_2']]
# --- Script chunk: set2b BRISK features (reformatted) ---
# NOTE(review): close() below is a method of a class whose header lies before
# this span; it releases the self.null_fds descriptors opened elsewhere.
    def close(self):
        # Close the null files
        os.close(self.null_fds[0])
        os.close(self.null_fds[1])

def suppress_pool_init():
    # Pool worker initializer: silence worker stdout/stderr by pointing file
    # descriptors 1 and 2 at os.devnull. The fds are not closed here —
    # presumably deliberate so they live for the worker's lifetime; verify.
    # Open a pair of null files
    null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
    # Assign the null pointers to stdout and stderr.
    os.dup2(null_fds[0], 1)
    os.dup2(null_fds[1], 2)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set2b_brisk.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Select columns required by script
df = df[['itemID_1', 'itemID_2', 'images_array_1', 'images_array_2']]
# --- Script chunk: set4d clean similarity features (reformatted) ---
# NOTE(review): the statements below are the interior of a per-row feature
# function; `row`, `values` and the *id column indices are bound before this span.
    counters = 0.0
    # One binary match feature per categorical column pair, plus their mean.
    for d in [catid, locid, metid, pcatid, regid]:
        # NOTE(review): `row[d] not in ["NA", np.nan]` is unreliable for NaN —
        # `in` compares with == and NaN != NaN, so only the identical np.nan
        # object is caught. Confirm whether missing values can slip through.
        if row[d] == row[d + 1] and row[d] not in ["NA", np.nan]:
            values.append(1.0)
            counters += 1.0
        else:
            values.append(0.0)
    values.append(counters / 5.0)  # fraction of the five ids that matched
    return values

print(a.c.BOLD + 'Extracting set4d clean similarity features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set4d_similarity_clean.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

# Load the cached train or test pairs depending on mode (0 = train, 1 = test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# NOTE(review): column-selection list continues beyond the visible span.
df = df[[
    'itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1',
######################### # Define cleaning parameters stopwords = get_stop_words('ru') exclude_cats = set([ 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sk', 'Sc', 'So', 'Co', 'Cf', 'Cc', 'Cs', 'Cn' ]) sno = nltk.stem.SnowballStemmer('russian') ######################### print(a.c.BOLD + 'Cleaning input data ...' + a.c.END) # Get train/test mode from launch argument mode = a.get_mode(sys.argv, '1_data_preprocessing.py') ## Read settings required by script config = a.read_config() nthreads = config.preprocessing_nthreads cache_loc = config.cache_loc category_loc = config.category_csv location_loc = config.location_csv debug = config.debug if mode == 0: data_loc = config.train_ItemInfo pairs_loc = config.train_ItemPairs if mode == 1: data_loc = config.test_ItemInfo pairs_loc = config.test_ItemPairs