def downloadImages(dataset):
    """Load, resize and cache every usable image listed in a dataset CSV.

    Rows whose ``likeRatio`` is greater than 1 are skipped entirely. For each
    remaining row the image referenced by ``imgUrl`` is opened through
    ``imageProcess.Image`` and resized to ``(TARGET_X, TARGET_Y)``. Rows whose
    image cannot be opened or resized are counted as "not processed", the
    error is printed, and the row is dropped.

    Side effects:
        * prints progress and the fraction of rows that could not be processed;
        * saves a scatter plot of the collected image shapes to
          ``datasets/<name>_distribution.png``;
        * writes ``allImgs_<name>.npy`` and ``allResults_<name>.npy`` to the
          current working directory.

    ``<name>`` is ``dataset`` with everything up to and including its first
    ``/`` and its last four characters (the ``.csv`` suffix) removed.

    Args:
        dataset: Path of the CSV file. It must provide the ``imgUrl`` and
            ``likeRatio`` columns.

    Returns:
        A tuple ``(allImgs, allResults)`` of parallel lists: the resized
        images and the float ``likeRatio`` of the rows they came from.
    """
    print('Start reading features')
    allImgs = []
    allResults = []
    shapes = []
    notProcessed = 0
    totalImgs = 0

    with open(dataset) as f:
        for row in csv.DictReader(f):
            likeRatio = float(row["likeRatio"])
            if likeRatio > 1.:
                continue
            print(totalImgs)
            totalImgs += 1
            try:
                image = imageProcess.Image(row["imgUrl"], True)
                imageShape = image.getImageShape()
                # Keep only the first two dimensions so that images with and
                # without a channel axis can share one plot.
                shapes.append((imageShape[0], imageShape[1]))
                image_rescaled = resize(image.skimageImage,
                                        (TARGET_X, TARGET_Y),
                                        anti_aliasing=False)
            except Exception as e:
                # Best effort: one broken URL or image must not abort the run.
                notProcessed += 1
                print(e)
                continue
            allImgs.append(image_rescaled)
            allResults.append(likeRatio)

    # An empty or fully filtered dataset must not raise ZeroDivisionError.
    notProcessedRatio = notProcessed / totalImgs if totalImgs else 0.0
    print("not processed: " + str(notProcessedRatio))

    datasetName = dataset[dataset.find("/") + 1:-4]

    plt.figure()
    if shapes:
        # x = first shape entry, y = second shape entry, matching the
        # 'height' / 'width' axis labels below.
        plt.scatter(*zip(*shapes))
    plt.title('image shape distribution')
    plt.ylabel('width')
    plt.xlabel('height')
    plt.savefig(f"datasets/{datasetName}_distribution.png")

    np.save(f"allImgs_{datasetName}.npy", allImgs)
    np.save(f"allResults_{datasetName}.npy", allResults)
    return allImgs, allResults
def extractFeaturesFromDataset(filename):
    """Build sparse feature dictionaries from a dataset CSV and split them 2:1.

    For every CSV row a ``defaultdict(float)`` feature vector is filled with:

    * time-of-day indicator flags derived from ``timestamp`` (interpreted by
      ``datetime.fromtimestamp``, i.e. in the local time zone);
    * ``capContainsFood`` / ``capContainsFollow`` / ``capContainsAd`` flags
      from the lower-cased ``caption``;
    * ``percentageFaces`` computed by ``imageProcess`` from the image at
      ``imgUrl``;
    * every remaining column whose value parses as a float, stored under the
      column name (``likeRatio``, ``likeCount``, ``commentCount`` and
      ``timestamp`` are never copied).

    Rows whose image cannot be opened are counted as "not processed" and
    dropped. The label of a row is ``float(row["likeRatio"])``.

    NOTE(review): this file contains a second, later definition named
    ``extractFeaturesFromDataset``; if both live in the same module the later
    one rebinds the name.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(trainData, testData)``. Both are lists of
        ``(featureVector, label)`` pairs; the first ``2 * int(n / 3)`` rows
        (in file order) form the training set and the rest the test set.
    """
    net = imageProcess.runFaceDetectDNN()
    print('Start reading features')

    # Columns that must never be copied verbatim into the feature vector.
    skippedKeys = ("likeRatio", "likeCount", "commentCount", "timestamp")
    listFeatureVectorsWithResult = []
    notProcessed = 0

    with open(filename) as f:
        for row in csv.DictReader(f):  # each row is a dict
            featureVector = defaultdict(float)
            imageNotProcessed = False

            for key in row:
                if key == "timestamp":
                    hourOfDay = datetime.fromtimestamp(int(row[key])).hour
                    # The 10-14 bucket is intentionally not emitted (it was
                    # commented out originally; presumably it serves as the
                    # reference category for the other indicators).
                    featureVector['between2and6'] = 2 <= hourOfDay < 6
                    featureVector['between6and10'] = 6 <= hourOfDay < 10
                    featureVector['between14and18'] = 14 <= hourOfDay < 18
                    featureVector['between18and22'] = 18 <= hourOfDay < 22
                    featureVector['between22and2'] = (hourOfDay >= 22
                                                      or hourOfDay < 2)

                if key == "caption":
                    # A missing caption is treated as empty instead of
                    # crashing on None.lower().
                    caption = (row[key] or "").lower()
                    featureVector["capContainsFood"] = (
                        1 if "food" in caption else 0)
                    featureVector["capContainsFollow"] = (
                        1 if "follow" in caption else 0)
                    # NOTE(review): plain substring test, so words such as
                    # "made" or "dad" also set this flag - confirm intended.
                    featureVector["capContainsAd"] = (
                        1 if "ad" in caption else 0)

                if key == "imgUrl":
                    try:
                        image = imageProcess.Image(row[key], True)
                    except Exception as e:
                        # Best effort: drop the row, keep the run alive.
                        print(e)
                        imageNotProcessed = True
                        break
                    faceInfo = imageProcess.extractFaceInfo(image, net)
                    featureVector["percentageFaces"] = (
                        imageProcess.extractTotalPercentAreaFaces(faceInfo))

                if key in skippedKeys:
                    continue

                # Copy the column only when its value is numeric; text values
                # (and missing values) are expected to fail here.
                try:
                    featureVector[key] = float(row[key])
                except (TypeError, ValueError):
                    continue

            if imageNotProcessed:
                notProcessed += 1
                continue

            label = float(row["likeRatio"])
            listFeatureVectorsWithResult.append((featureVector, label))

    print('Finished extracting features')

    limitLen = 2 * int(len(listFeatureVectorsWithResult) / 3)
    trainData = listFeatureVectorsWithResult[:limitLen]
    testData = listFeatureVectorsWithResult[limitLen:]

    # Ratio over all rows read, so it stays within [0, 1] and does not
    # divide by zero when nothing could be processed.
    totalRows = len(listFeatureVectorsWithResult) + notProcessed
    notProcessedRatio = notProcessed / totalRows if totalRows else 0.0
    print(f"Not processed ratio: {notProcessedRatio}")

    return (trainData, testData)
# Day names compared against the value returned by ep_to_day, in output order.
_WEEKDAYS = ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",
             "Friday", "Saturday")

# Substrings looked up in the accessibility caption, in output order.
_CAPTION_KEYWORDS = (
    "people", "and", "one", "or", "more", "standing", "nature", "closeup",
    "sitting", "tree", "photo", "no", "description", "available", "cloud",
    "beard", "mountain", "child", "playing", "sports", "sunglasses", "on",
    "grass", "suit", "selfie", "crowd", "1", "person", "wedding", "baby",
)

# Columns holding the strings "True" / "False".
_BOOLEAN_COLUMNS = ("isBusinessAcc", "isVerified", "hasChannel")

# Columns that are targets (or target ingredients) and never become features.
_TARGET_COLUMNS = ("likeRatio", "likeCount", "commentCount")


def _timestampFeatures(timestamp):
    """Return six hour-of-day flags followed by seven day-of-week flags."""
    hourOfDay = datetime.fromtimestamp(timestamp).hour
    features = [
        int(2 <= hourOfDay < 6),
        int(6 <= hourOfDay < 10),
        int(10 <= hourOfDay < 14),
        int(14 <= hourOfDay < 18),
        int(18 <= hourOfDay < 22),
        int(hourOfDay >= 22 or hourOfDay < 2),
    ]
    dayOfWeek = ep_to_day(timestamp)
    features.extend(int(dayOfWeek == day) for day in _WEEKDAYS)
    return features


def _captionKeywordFeatures(accessibilityCaption):
    """Return one 0/1 flag per entry of _CAPTION_KEYWORDS (substring test)."""
    return [int(keyword in accessibilityCaption)
            for keyword in _CAPTION_KEYWORDS]


def extractFeaturesFromDataset(filename):
    """Extract resized images, dense feature vectors and labels from a CSV.

    Columns are visited in CSV order and contribute to each row's feature
    list as follows:

    * ``timestamp``: six hour-of-day flags (``datetime.fromtimestamp``, i.e.
      local time) and seven day-of-week flags (via ``ep_to_day``);
    * ``accessibilityCaption``: one 0/1 flag per keyword in
      ``_CAPTION_KEYWORDS`` (case-sensitive substring test);
    * ``imgUrl``: no feature; the image is opened through
      ``imageProcess.Image`` and resized to ``(TARGET_X, TARGET_Y)``;
    * ``likeRatio``, ``likeCount``, ``commentCount``: ignored;
    * ``isBusinessAcc``, ``isVerified``, ``hasChannel``: 1 if the value is the
      string ``"True"``, otherwise 0;
    * any other column: appended as a float when it parses as one, otherwise
      ignored.

    The label of a row is ``likeCount / userAverageLikes``. A row is dropped
    (and its error printed) when any step above raises, when it has no image,
    when its label cannot be computed, or when its feature list has a
    different length than the rows accepted before it.

    Side effects: prints progress, saves a scatter plot of the image shapes to
    ``datasets/<name>_distribution.png`` and writes ``allImgs_<name>.npy``,
    ``allResults_<name>.npy`` and ``featureVectors_<name>.npy`` to the current
    working directory. ``<name>`` is ``filename`` with everything up to and
    including its first ``/`` and its last four characters removed.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(allImgs, featureVectors, allResults)`` of NumPy arrays with
        one entry per accepted row.
    """
    print("PELE MEJOR QUE MARADONA!")
    print('Start reading features')

    featureVectors = []
    allImgs = []
    allResults = []
    shapes = []
    notProcessed = 0
    totalImgs = 0

    with open(filename) as f:
        for row in csv.DictReader(f):  # each row is a dict
            if totalImgs == 0:
                print(row.keys())
            print(totalImgs)
            totalImgs += 1

            featureVector = []
            # Reset per row so a row can never reuse the previous row's image.
            image_rescaled = None
            try:
                for key in row:
                    if key == "timestamp":
                        featureVector.extend(
                            _timestampFeatures(int(row[key])))
                    elif key == "accessibilityCaption":
                        featureVector.extend(
                            _captionKeywordFeatures(row[key]))
                    elif key == "imgUrl":
                        image = imageProcess.Image(row[key], True)
                        imageShape = image.getImageShape()
                        shapes.append((imageShape[0], imageShape[1]))
                        image_rescaled = resize(image.skimageImage,
                                                (TARGET_X, TARGET_Y),
                                                anti_aliasing=False)
                    elif key in _TARGET_COLUMNS:
                        # The label is derived from these after the loop.
                        continue
                    elif key in _BOOLEAN_COLUMNS:
                        featureVector.append(int(row[key] == "True"))
                    else:
                        # Keep the column only when its value is numeric;
                        # text and missing values are expected to fail here.
                        # (The feature vector is a list, so the value must be
                        # appended - indexing it by column name raised a
                        # TypeError that silently discarded every such column.)
                        try:
                            featureVector.append(float(row[key]))
                        except (TypeError, ValueError):
                            continue

                if image_rescaled is None:
                    raise ValueError("row has no imgUrl column")
                label = (float(row["likeCount"])
                         / float(row["userAverageLikes"]))
            except Exception as e:
                # Best effort: drop the row, report why, keep going.
                notProcessed += 1
                print(e)
                continue

            # All rows must have the same width or np.array() below would
            # produce a ragged array.
            if featureVectors and len(featureVector) != len(featureVectors[0]):
                notProcessed += 1
                print(f"unexpected feature count: {len(featureVector)}")
                continue

            allResults.append(label)
            allImgs.append(image_rescaled)
            featureVectors.append(featureVector)

    notProcessedRatio = notProcessed / totalImgs if totalImgs else 0.0
    print(f"not processed: {notProcessedRatio}")

    datasetName = filename[filename.find("/") + 1:-4]
    featureVectors = np.array(featureVectors)
    allResults = np.array(allResults)
    allImgs = np.array(allImgs)

    plt.figure()
    plt.title('image shape distribution')
    plt.ylabel('width')
    plt.xlabel('height')
    if shapes:
        # zip(*[]) yields nothing, which would call scatter() without data.
        plt.scatter(*zip(*shapes))
    plt.savefig(f"datasets/{datasetName}_distribution.png")

    np.save(f"allImgs_{datasetName}.npy", allImgs)
    np.save(f"allResults_{datasetName}.npy", allResults)
    np.save(f"featureVectors_{datasetName}.npy", featureVectors)
    return allImgs, featureVectors, allResults
import imageprocess as imageProcess
import csv
from skimage.transform import rescale, resize, downscale_local_mean

# Scale factor applied to both spatial dimensions of every accepted image.
RESIZE_FACTOR = 0.25

# Script: read the first neural-net dataset, keep only images whose shape is
# exactly (1080, 1080, 3), rescale them by RESIZE_FACTOR and collect them in
# `allImgs` together with their `likeRatio` in `allResults`.
print('Start reading features')
with open('datasets/neuralnet-firstdataset.csv') as f:
    allImgs = []
    allResults = []
    notProcessed = 0
    totalImgs = 0
    correctShape = 0
    for row in csv.DictReader(f):
        totalImgs += 1
        try:
            image = imageProcess.Image(row["imgUrl"], True)
            if image.getImageShape() != (1080, 1080, 3):
                # Wrong size or channel count: skipped, not counted as error.
                continue
            # NOTE(review): `multichannel` was replaced by `channel_axis` in
            # newer scikit-image releases - confirm against the pinned version.
            image_rescaled = rescale(image.skimageImage,
                                     RESIZE_FACTOR,
                                     anti_aliasing=False,
                                     multichannel=True)
            correctShape += 1
        except Exception as e:
            # Best effort: report the failure and move on to the next row.
            notProcessed += 1
            print(e)
            continue
        allImgs.append(image_rescaled)
        allResults.append(float(row["likeRatio"]))

# An empty CSV must not end the script with ZeroDivisionError.
notProcessedRatio = notProcessed / totalImgs if totalImgs else 0.0
correctShapeRatio = correctShape / totalImgs if totalImgs else 0.0
print(f"not processed: {notProcessedRatio}")
print(f"correct shape total: {correctShape}")
print(f"correct shape ratio: {correctShapeRatio}")