def search(self, query, queryType):
    """
    Searches the IR system for relevant documents

    Arguments:
        query     - String query
        queryType - Integer query type (< 2: standard query, otherwise wildcard/permuterm query)
    Returns:
        List of ranked document lists (JSON generation via generateOutput is currently commented out)
    """
    if queryType < 2:
        termList, queryMetadata = preProcess(query, queryType)
        # print(termList, queryMetadata)
        docList = self._index.getDocuments(termList, queryType, queryMetadata)
        # print(len(docList))
        rankedDocList = [self._vectorSpace.vectorSpaceRank(docList, termList)]
        # output = generateOutput(rankedDocList)
    else:
        termList, queryMetadata = preProcess(query, queryType)
        docs = self._permutermIndex.getDocuments(termList, queryType, queryMetadata)
        rankedDocList = []
        for docList, termList in docs:
            rdl = self._vectorSpace.vectorSpaceRank(docList, termList)
            rankedDocList.append(rdl)
        # output = generateOutput(rankedDocList)
    return rankedDocList
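# The ranking step above delegates to self._vectorSpace.vectorSpaceRank, whose implementation
# is not shown here. As a generic illustration only (not the project's actual ranker), a plain
# term-frequency cosine ranking over (docId, tokenList) pairs might look like this:
import math
from collections import Counter

def cosine_rank(docs, terms):
    """Rank (doc_id, token_list) pairs by cosine similarity to the query terms."""
    query_vec = Counter(terms)
    q_norm = math.sqrt(sum(v * v for v in query_vec.values()))
    scored = []
    for doc_id, tokens in docs:
        doc_vec = Counter(tokens)
        d_norm = math.sqrt(sum(v * v for v in doc_vec.values()))
        dot = sum(query_vec[t] * doc_vec.get(t, 0) for t in query_vec)
        scored.append((doc_id, dot / (q_norm * d_norm) if q_norm and d_norm else 0.0))
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Example: cosine_rank([(1, ["fast", "search"]), (2, ["slow", "scan"])], ["fast", "search"])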
def imageManipulate(self):
    # Apply the settings from the image-manipulation frame to the image and write a target.png
    # file; self.is_edited records whether any edit was actually made
    self.is_edited = preProcess.preProcess(image_path, target_image_path,
                                           self.grayCheck.get(),
                                           self.Threashold_level.get())
    if self.is_edited:
        self.showPreviewImage(target_image_path)
    else:
        self.showPreviewImage()
def main():
    # read the config file
    config = yaml.safe_load(open("config.yml"))

    # fetch image filenames from the image directory
    img_dir = config['img_path']
    image_files = [f for f in listdir(img_dir) if isfile(join(img_dir, f))]
    image_files = [f for f in image_files if not f.startswith('.')]

    # directory to store processed images
    img_processed_dir = config['img_processed_path']
    # directory to store extracted text files
    img_text_dir = config['text_path']

    for imgFile in image_files:
        imgPath = join(img_dir, imgFile)
        ImgProcPath = join(img_processed_dir, imgFile)
        # text output file path
        textOP = join(img_text_dir, imgFile.split('.')[0] + '.txt')
        if not isfile(textOP):
            print('Begin Processing ' + imgFile)
            # image pre-processing
            preProcess(ImgProcPath, imgPath)
            # extract text from the pre-processed image
            extracted_str = image_to_string(Image.open(ImgProcPath), lang="eng", config="-psm 1")
            extracted_str = extracted_str.lower()
            print("OUTPUT: " + textOP)
            with open(textOP, 'w', encoding='utf-8') as text_file:
                text_file.write(extracted_str)
        else:
            print("Already exists: " + textOP)
            # print(extracted_str)

    # call method to classify receipts
    classifyReceipt()
def main():
    all = learn(r'C:\Users\hp\Desktop\code\code\hard\train')
    str1 = input("Please input path:")
    with open('result.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('name', 'code'))
        for home, dir, files in os.walk(str1):
            for filename in files:
                info = preProcess(os.path.join(home, filename))
                answer = ''
                for i in info:
                    vote = []
                    # print(len(i))
                    distance = []
                    for j in all:
                        distance.append(calDistance(j[0], i))
                    # collect the three nearest training samples
                    index = distance.index(min(distance))
                    vote.append([distance[index], all[index][1]])
                    distance[index] = 1000
                    index = distance.index(min(distance))
                    vote.append([distance[index], all[index][1]])
                    distance[index] = 1000
                    index = distance.index(min(distance))
                    vote.append([distance[index], all[index][1]])
                    distance[index] = 1000
                    # majority vote among the three nearest neighbours,
                    # falling back to the closest one when all three disagree
                    if vote[0][1] == vote[1][1]:
                        answer += str(vote[0][1])
                        continue
                    elif vote[0][1] == vote[2][1]:
                        answer += str(vote[0][1])
                        continue
                    elif vote[1][1] == vote[2][1]:
                        answer += str(vote[1][1])
                        continue
                    else:
                        answer += str(vote[0][1])
                writer.writerow((filename, answer))
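# The three nearest-neighbour lookups and the chain of equality checks above implement a
# 3-NN majority vote. A compact, self-contained sketch of the same idea (the project's
# calDistance is replaced here by a plain Euclidean distance for illustration):
import math
from collections import Counter

def knn_vote(sample, training, k=3):
    """training is a list of (feature_vector, label) pairs; returns the majority label."""
    dist = lambda a, b: math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    nearest = sorted(training, key=lambda item: dist(item[0], sample))[:k]
    labels = Counter(label for _, label in nearest)
    return labels.most_common(1)[0][0]

# Example: knn_vote([1.0, 1.0], [([0, 0], 'a'), ([1, 1], 'b'), ([2, 2], 'b')])  -> 'b'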
def build(self):
    """
    Reads files one-by-one and builds the index

    Arguments:
        None
    Returns:
        None
    """
    if os.path.isfile("InvertedIndex"):
        print("Loading the Inverted Index from file")
        with open("InvertedIndex", "rb") as file:
            self._btree = pickle.load(file)
    else:
        print("Building the Inverted Index")
        dirPath = os.path.dirname(os.path.realpath(__file__))
        dataPath = os.path.realpath(os.path.join(dirPath, "..", "data"))
        files = [os.path.join(dataPath, file) for file in sorted(os.listdir(dataPath))]
        for file in files:
            snippets = getSnippets(file)
            for index, snippet in enumerate(snippets):
                filename = int(os.path.split(file)[1].split(".csv")[0])
                docId = (filename, index + 2)
                tokens, snippetMetadata = preProcess(snippet)
                self.updateIndex(tokens, docId)
        self.sortPostingLists()
        sys.setrecursionlimit(10000)
        with open("InvertedIndex", "wb") as file:
            print("Saving the Inverted Index to file")
            pickle.dump(self._btree, file)
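# build() above follows a simple cache-on-disk pattern: load the pickled index if it already
# exists, otherwise build it and pickle it. The pattern in isolation, with a hypothetical
# build_fn and cache path (illustration only, not the project's API):
import os
import pickle

def load_or_build(cache_path, build_fn):
    """Return the cached object if present; otherwise build it, cache it, and return it."""
    if os.path.isfile(cache_path):
        with open(cache_path, "rb") as fh:
            return pickle.load(fh)
    obj = build_fn()
    with open(cache_path, "wb") as fh:
        pickle.dump(obj, fh)
    return obj

# Example: index = load_or_build("InvertedIndex.cache", lambda: {"term": [(1, 2)]})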
videoFile = '../assets/sample2.mp4'
frameRate = 30

Cats = []
Success = False
for catIndex in range(5):
    filename = ds.genFileName(videoFile, catIndex)
    print('Loading ' + filename)
    meta, cat = ds.load(filename)
    if len(cat) > 0:
        Cats.append(cat)
        Success = True

if not Success:
    Cats = pp.preProcess(videoFile, 5, [5, 4, 3, 2, 1], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5], 2, 1)
    print('Saving data...')
    ds.dump(videoFile, Cats)

buff = vb.Buff(10000)
for cat in Cats:
    for msg in cat:
        buff.write(msg)
buff.finished = True

cv2.namedWindow("Nextflix")

# ------------------------------------------------------------------------------------
frameList = []
def rankDocs(query, index_collection=index_collection):
    # preProcess the query the same way documents were processed during indexing
    query = preProcess(query)
    if query == []:
        return None
    mongo_query = []
    for token in query:
        mongo_query.append({"word": token})
    docs_cursor = index_collection.find({"$or": mongo_query})

    # sort docs retrieved from the db in the order of the query,
    # i.e. query = q1, q2, q3 and query_results = [d1, d2, d3]
    keywordstodoc = {}
    for doc in docs_cursor:
        keywordstodoc[doc["word"]] = (doc["docList"], doc["idf"])

    docptrs = [[0, False] for i in range(len(query))]
    isEmpty = 0
    for i in range(len(query)):
        if query[i] not in list(keywordstodoc.keys()):
            docptrs[i] = [0, True]
            keywordstodoc[query[i]] = None
            isEmpty += 1
    if isEmpty == len(query):
        print("Could not find any documents for your query; please check spelling or rephrase your search")
        return None

    minHeap = MinHeap(len(query) + 1, param=1, type='ObjectId')
    isTraversed = False
    for i in range(len(query)):
        if docptrs[i][1] == False:
            # insert the first document corresponding to every keyword in the query into the minHeap
            doctoinsert = keywordstodoc[query[i]][0][0]
            minHeap.insert((i, doctoinsert["docid"]))
    minHeap.minHeap()

    minDoc = minHeap.min()  # smallest document in the heap
    # list of the common documents; each entry holds the query number and the position of
    # that document in the corresponding docList
    scoreDoc = [[minDoc[0], docptrs[minDoc[0]][0]]]
    scores = []
    # dictionary mapping each substring present in every document to the maximum number of
    # times it occurs over all documents, to be used for scoring docs
    max_substrings = {}

    while isTraversed == False:
        # find the query associated with the smallest document
        query_number = minDoc[0]
        smallest_query = query[query_number]
        # increment the pointer into the docList associated with that query
        docptrs[query_number][0] += 1
        docptr = docptrs[query_number][0]
        # check whether the pointer has reached the end of that query's docList
        if docptr == len(keywordstodoc[query[query_number]][0]):
            # the docList is exhausted: set its flag to True and remove the root of the minHeap
            docptrs[query_number][1] = True
            minHeap.remove()
        else:
            # replace the root of the minHeap with the next doc in the docList
            minHeap.removeandreplace((query_number, keywordstodoc[smallest_query][0][docptr]["docid"]))
        # get the new smallest document in the heap after heapifying
        newminDoc = minHeap.min()
        if minDoc[1] == newminDoc[1]:
            # the smallest document is unchanged: append the query number and the index of that
            # document in its docList to scoreDoc
            scoreDoc.append([newminDoc[0], docptrs[newminDoc[0]][0]])
            minDoc = newminDoc
        else:
            # the smallest document has changed: score the previous document
            score = getScore(scoreDoc, query, keywordstodoc)
            scores.append(score)
            minDoc = newminDoc
            # start a new scoreDoc list for the new smallest document
            scoreDoc = [[newminDoc[0], docptrs[minDoc[0]][0]]]
        # check if all the docLists have been fully traversed
        query_counter = 0
        for ptr in docptrs:
            if ptr[1] == True:
                query_counter += 1
        if query_counter == len(query):
            isTraversed = True
    return sort_tuple(scores)
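# rankDocs above walks several sorted docLists in parallel through a custom MinHeap, scoring a
# document once every list pointing at it has been advanced past it. The core k-way merge idea
# is sketched below with the standard-library heapq instead of the project's MinHeap class
# (illustration only):
import heapq

def merge_posting_lists(posting_lists):
    """Yield (doc_id, [list_indices]) for every doc_id, in ascending doc_id order."""
    heap = [(lst[0], i, 0) for i, lst in enumerate(posting_lists) if lst]
    heapq.heapify(heap)
    current, owners = None, []
    while heap:
        doc_id, list_idx, pos = heapq.heappop(heap)
        if doc_id != current:
            if current is not None:
                yield current, owners
            current, owners = doc_id, []
        owners.append(list_idx)
        if pos + 1 < len(posting_lists[list_idx]):
            heapq.heappush(heap, (posting_lists[list_idx][pos + 1], list_idx, pos + 1))
    if current is not None:
        yield current, owners

# Example: list(merge_posting_lists([[1, 4, 7], [2, 4], [4, 9]]))
#          -> [(1, [0]), (2, [1]), (4, [0, 1, 2]), (7, [0]), (9, [2])]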
    # fragment: body of the decision-tree prediction routine (the enclosing def is not part
    # of this snippet); walks the tree using the sample's feature values
    available_features = list(x)
    if self.tree_built:
        pres_node = self.root
        while pres_node.definiteLabel == None:
            attribute = pres_node.attribute
            if pres_node.children[available_features[attribute] - 1]:
                pres_node = pres_node.children[available_features[attribute] - 1]
                available_features.pop(attribute)
            else:
                return pres_node.labelSamples
        return pres_node.definiteLabel


if __name__ == '__main__':
    preProcessData = preProcess()
    preProcessData.handle_missing_values()
    preProcessData.handle_highly_correlated_features()
    df = preProcessData.return_df()

    DT = decisionTree()
    kFold = KFold(6, True, 1)
    values = df.values
    for train, test in kFold.split(values):
        print("Taking %d train datapoints" % len(train))
        train_x = values[train]
        test_x, test_y = values[test][:, :-1], values[test][:, -1]
        DT.fit(train_x)
        pred_y = DT.predict(test_x)
        results = testResults(pred_y, test_y)
        print("Accuracy of model is ", results.return_accuracy())
        print('F Score of model is ', results.return_fscore())
processes = []      # array of current processes
subprocesses = []   # array of subprocesses
# --------------------------------------------------------------------

################## Timer ##########################
if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################
if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating,
                   config.TEST_SUBSET, config.PROCESS_TAGS, config.PROCESS_SOCIAL,
                   config.PROCESS_HISTORY, processes, mproc)
    print("Pre-Processing Complete")

################### Setup Models ###################
if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################### Run Models ###################
if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths, CVPredictionPaths,
                        config.TRIALS, RMSEPaths, False)

#### Fix #####
processes = []      # array of current processes
subprocesses = []   # array of subprocesses
# --------------------------------------------------------------------

################## Timer ##########################
if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################
if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating,
                   config.LAPTOP_TEST, config.PROCESS_MOVIE_TAG,
                   config.PROCESS_SOCIAL, config.PROCESS_HISTORY)
    print("Pre-Processing Complete")

################### Setup Models ###################
if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################### Run Models ###################
if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths, CVPredictionPaths,
                        config.TRIALS, RMSEPaths, False)
from scipy.sparse import bsr_matrix
from numpy import dot, linalg  # assumed source of dot/linalg used below

from preProcess import preProcess
from loadSparseData import loadSparseData
from PA import PA

fname1 = 'pu1'
fname2 = 'pu2'
fname3 = 'pu3'
fname4 = 'pua'

X1, Y1 = loadSparseData(fname1)
X2, Y2 = loadSparseData(fname2)
X3, Y3 = loadSparseData(fname3)
X4, Y4 = loadSparseData(fname4)

print('preProcess X')
X1 = preProcess(X1)
X2 = preProcess(X2)
X3 = preProcess(X3)
X4 = preProcess(X4)

print('start COML')


def update_CM(i, X, Y, w, u):
    # one online update of w toward u with a hinge-loss-driven step size;
    # C, fai1, and fai2 are hyper-parameters defined elsewhere in the original script
    x_t = X[i]
    f_t = dot(w, x_t)
    loss_w = max(0, 1 - Y[i] * f_t)
    if loss_w == 0:
        tau = 0
    else:
        tau = min(C, (fai1 + fai2 - dot(Y[i] * (fai1 * w + fai2 * u), x_t) / linalg.norm(x_t)))
    w = fai1 * w + fai2 * u + tau * Y[i] * x_t.reshape(1, len(x_t))
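# update_CM above blends the current weight vector w with a reference vector u using a
# hinge-loss-driven step size tau. For comparison, the classical passive-aggressive (PA-I)
# update on a single example is sketched below with numpy (a generic textbook update, not
# the PA module imported above):
import numpy as np

def pa1_update(w, x, y, C=1.0):
    """One PA-I step: move w just far enough to fix a margin violation, capped by C."""
    loss = max(0.0, 1.0 - y * np.dot(w, x))
    if loss == 0.0:
        return w
    tau = min(C, loss / np.dot(x, x))
    return w + tau * y * x

# Example: pa1_update(np.zeros(2), np.array([1.0, -1.0]), 1)  -> array([ 0.5, -0.5])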
def main():
    testMode = True  # set this value to False if you want to recover data from disk
    videoFile = '../assets/chaplinCut.mp4'
    grayScale = True
    frameRate = 25

    # the gap of frames that will be waited each time the video stops;
    # increase this value for the video to be more fluid
    framesBeforeStart = 30
    # the maximum number of objects that can wait for new data;
    # increase this value if you want better quality
    receiveWindow = 30

    # read the pre-processed data
    Cats = []
    Success = False
    if not testMode:
        for catIndex in range(2):
            filename = ds.genFileName(videoFile, catIndex)
            meta, cat = ds.load(filename)
            if len(cat) > 0:
                Cats.append(cat)
                Success = True

    # if the pre-processed data does not exist, create it
    if not Success:
        nCats = 2
        msgRedundancies = [0, 0]
        exp = 2
        fixed = 1
        msgPeriods = [180, 90]
        msgSize = [1, 2]
        Cats = pp.preProcess(videoFile, nCats, msgPeriods,
                             msgRedundancies, msgSize, exp, fixed, grayScale)
        # Cats = pp.preProcess(videoFile, 2, [600, 480, 360, 240, 120], [0, 0, 0, 0, 0],
        #                      [1, 1.2, 1.4, 1.6, 1.8], 2, 1, grayScale)
        # Cats = pp.preProcess(videoFile, 5, [5, 4, 3, 2, 1], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5], 2, 1)
        if not testMode:
            print('Saving data...')
            ds.dump(videoFile, Cats)
        # the video buffer
        buff = vb.Buff(10000, len(Cats[0][0].channel))
    else:
        # the video buffer
        buff = vb.Buff(10000, meta.nChannels)

    # the messages arrive
    for cat in Cats:
        for msg in cat:
            buff.write(msg)
    buff.finished = True

    vc.startDisplayMechanism(framesBeforeStart, receiveWindow, buff, frameRate)

    while buff.getCode_displayer() != -1:
        pass

    print('Exiting application')
def selection(thresh=2, lenThreshRatio=1.0 / 10):
    # fragment: intended to run as a method of the line-detection class (uses self);
    # the rest of the body is not part of this snippet
    verLines = self.verLines
    horLines = self.horLines


if __name__ == '__main__':
    for index in range(2, 6):
        i = str(index)
        filePath = "images/tree" + i + "_wo.jpg"
        image = cv.imread(filePath, 0)
        plt.imshow(image, cmap='Greys_r')
        plt.show()
        p = preProcess(image)
        p.bolding()
        p.displayImage()
        oriImage = p.returnImage()
        a = scaleCornerDetection(oriImage)
        a.parsing()

    # for index in range(1, 5):
    #     filePath = "images/tree%d_wo.jpg" % index
    #     print filePath
    #     image = cv.imread(filePath, 0)
    #     a = cornerDetection(image)
    #     cv.imwrite('images/tree%d_corner.jpg', a, [cv.IMWRITE_JPEG_QUALITY, 100])
    # image = cv.imread("images/tree3.jpg", 0)
# where CV predictions are saved
processes = []      # array of current processes
subprocesses = []   # array of subprocesses
# --------------------------------------------------------------------

################## Timer ##########################
if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################
if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating)
    print("Pre-Processing Complete")

################### Setup Models ###################
if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################### Run Models ###################
if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths, CVPredictionPaths,
                        config.TRIALS, RMSEPaths, False)

#### Fix #####
processes = []      # array of current processes
subprocesses = []   # array of subprocesses
# --------------------------------------------------------------------

################## Timer ##########################
if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################
if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating,
                   config.TEST_SUBSET, config.PROCESS_TAGS, config.PROCESS_SOCIAL,
                   config.PROCESS_HISTORY, processes, mproc)
    print("Pre-Processing Complete")

################### Setup Models ###################
if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################### Run Models ###################
if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths, CVPredictionPaths,
                        config.TRIALS, RMSEPaths, False)
    '''
    df = formatPreds(dataDict, preds)
    errors = evaluate(df, ensemble=False)
    mape = errors["NN"]["mape"]
    rmse = errors["NN"]["rmse"]
    print("{:<10.1%}{:.2f}".format(mape, rmse))
    return (mape, rmse)


if __name__ == "__main__":
    args = getArgs()
    config = getConfig()
    df = getData(config)
    df = df.sample(frac=0.3)
    dataDict = preProcess(df, config, args)

    parms = getParms("NN")  # the hyper-parameter combinations to be tested
    results = []
    count = 1
    start_time = time.time()

    print("\n{} parameter combinations".format(len(parms)))
    print("\n{:<10}{}".format("MAPE", "RMSE"))

    for x in parms:
        parmDict = loadParms(x)
        preds = runNN(dataDict, parmDict, config)
def make_index(self):
    print("Building Index....")
    for doc in self.docs:
        self.docCount += 1
        id = doc['_id']
        print("Processing URL for doc {}".format(id))
        url = preProcessUrl(doc['url'])
        print("Processing transcript for doc {}".format(id))
        transcript = preProcess(doc['transcript'])

        for i in range(len(transcript)):
            word = transcript[i]
            if word != '':
                if word not in self.index.keys():
                    # word has not been seen in any document
                    self.index[word] = {
                        "word": word,
                        "idf": 1,
                        "docList": [{'docid': id, 'body': [i], 'tf': 1, 'url': []}]
                    }
                elif id != self.index[word]["docList"][-1]['docid']:
                    # word has not been seen in the current document
                    self.index[word]["docList"].append(
                        {'docid': id, 'body': [i], 'tf': 1, 'url': []})
                    self.index[word]["idf"] += 1
                else:
                    # word has been seen before in the document:
                    # append the position of the word in the doc to the index
                    self.index[word]["docList"][-1]['body'].append(i)
                    self.index[word]["docList"][-1]['tf'] += 1

        for i in range(len(url)):
            word = url[i]
            if word != '':
                if word not in self.index.keys():
                    # word has not been seen in any document
                    self.index[word] = {
                        "word": word,
                        "idf": 1,
                        "docList": [{'docid': id, 'body': [], 'tf': 1, 'url': [i]}]
                    }
                elif id != self.index[word]["docList"][-1]['docid']:
                    # word has not been seen in the current document
                    self.index[word]["docList"].append(
                        {'docid': id, 'body': [], 'tf': 1, 'url': [i]})
                    self.index[word]["idf"] += 1
                else:
                    # word has been seen before in the document
                    self.index[word]["docList"][-1]['url'].append(i)
                    self.index[word]["docList"][-1]['tf'] += 1

    wordCount = 0
    for word in self.index.keys():
        wordCount += 1
        # convert the raw document-frequency count into an idf score
        self.index[word]["idf"] = math.log(self.docCount / (self.index[word]["idf"] + 1))
        # print(word + '....{}'.format(self.index[word]["idf"]))
    print("Index with {} words successfully made".format(wordCount))
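# The final loop above converts each word's raw document-frequency count into an idf score
# with math.log(docCount / (df + 1)). A small worked check of that formula, assuming a
# hypothetical corpus of 100 documents:
import math

doc_count = 100
for df in (1, 10, 99):
    print(df, round(math.log(doc_count / (df + 1)), 3))
# df=1  -> log(100/2)   = 3.912  (rare word, high weight)
# df=10 -> log(100/11)  = 2.207
# df=99 -> log(100/100) = 0.0    (word in nearly every doc, no weight)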