Example #1
    def search(self, query, queryType):
        """
            Searches the IR system for relevant documents

            Arguments:
            query - String query

            Returns:
            JSON formatted document output
        """

        termList, queryMetadata = preProcess(query, queryType)
        if queryType < 2:
            docList = self._index.getDocuments(termList, queryType,
                                               queryMetadata)
            rankedDocList = [
                self._vectorSpace.vectorSpaceRank(docList, termList)
            ]
        else:
            docs = self._permutermIndex.getDocuments(termList, queryType,
                                                     queryMetadata)
            rankedDocList = [
                self._vectorSpace.vectorSpaceRank(docList, termList)
                for docList, termList in docs
            ]
        #output = generateOutput(rankedDocList)

        return rankedDocList
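The vectorSpaceRank call above is the ranking step; its implementation is not shown. A minimal sketch of a tf-idf cosine ranker of that general shape, with the data layout and all names assumed rather than taken from the project:

import math

def vectorSpaceRank(docs, queryTerms, idf):
    # docs: {docId: {term: tf}}; idf: {term: idf weight} (both assumed)
    ranked = []
    for docId, tf in docs.items():
        dot = sum(tf.get(t, 0) * idf.get(t, 0.0) ** 2 for t in queryTerms)
        dNorm = math.sqrt(sum((tf.get(t, 0) * idf.get(t, 0.0)) ** 2
                              for t in queryTerms))
        qNorm = math.sqrt(sum(idf.get(t, 0.0) ** 2 for t in queryTerms))
        score = dot / (dNorm * qNorm) if dNorm and qNorm else 0.0
        ranked.append((score, docId))
    return [docId for score, docId in sorted(ranked, reverse=True)]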
Example #2
    def imageManipulate(self):
        """Apply the image-manipulation frame's settings and write the
        target.png file; self.is_edited records whether any edit was made."""
        self.is_edited = preProcess.preProcess(image_path, target_image_path,
                                               self.grayCheck.get(),
                                               self.Threashold_level.get())

        if self.is_edited:
            self.showPreviewImage(target_image_path)
        else:
            self.showPreviewImage()
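preProcess.preProcess takes a source path, a target path, a grayscale flag and a threshold level, and reports whether any edit was made. A hypothetical Pillow-based implementation of that contract (a sketch, not the project's code):

from PIL import Image

def preProcess(srcPath, dstPath, gray, threshold):
    # Apply optional grayscale and binary thresholding; write dstPath
    # and report whether any edit was made.
    img = Image.open(srcPath)
    edited = False
    if gray:
        img = img.convert("L")
        edited = True
    if threshold:
        img = img.convert("L").point(lambda p: 255 if p > threshold else 0)
        edited = True
    if edited:
        img.save(dstPath)
    return edited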
Example #3
def main():
    # Read the config file
    config = yaml.safe_load(open("config.yml"))

    # Fetch image filenames from the image directory
    img_dir = config['img_path']
    image_files = [f for f in listdir(img_dir) if isfile(join(img_dir, f))]
    image_files = [f for f in image_files if not f.startswith('.')]

    # Directory to store processed images
    img_processed_dir = config['img_processed_path']

    # Directory to store processed text files
    img_text_dir = config['text_path']

    for imgFile in image_files:
        imgPath = join(img_dir, imgFile)
        imgProcPath = join(img_processed_dir, imgFile)

        # Text output file path
        textOP = join(img_text_dir, imgFile.split('.')[0] + '.txt')
        if not isfile(textOP):
            print('Begin Processing ' + imgFile)

            # Image pre-processing
            preProcess(imgProcPath, imgPath)

            # Extract text from the pre-processed image
            extracted_str = image_to_string(Image.open(imgProcPath),
                                            lang="eng", config="-psm 1")
            extracted_str = extracted_str.lower()

            print("OUTPUT: " + textOP)
            with open(textOP, 'w', encoding='utf-8') as text_file:
                text_file.write(extracted_str)
        else:
            print("Already exists: " + textOP)

    # Call method to classify receipts
    classifyReceipt()
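The script reads three path keys from config.yml; a minimal file matching them (values are placeholders) would be:

# config.yml (illustrative values)
img_path: ./images
img_processed_path: ./processed
text_path: ./text

Note that "-psm 1" is the Tesseract 3.x spelling of the page-segmentation flag; Tesseract 4 and later expect "--psm 1".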
Example #4
def main():
    all = learn(r'C:\Users\hp\Desktop\code\code\hard\train')
    str1 = input("Please input path:")
    with open('result.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('name', 'code'))
        for home, dirs, files in os.walk(str1):
            for filename in files:
                info = preProcess(os.path.join(home, filename))
                answer = ''
                for i in info:
                    # Distances from this character to every training sample
                    distance = []
                    for j in all:
                        distance.append(calDistance(j[0], i))
                    # Collect the three nearest samples as (distance, label)
                    vote = []
                    for _ in range(3):
                        index = distance.index(min(distance))
                        vote.append([distance[index], all[index][1]])
                        distance[index] = 1000  # sentinel: exclude from next pass
                    # Majority vote among the three; fall back to the nearest
                    if vote[0][1] == vote[1][1] or vote[0][1] == vote[2][1]:
                        answer += str(vote[0][1])
                    elif vote[1][1] == vote[2][1]:
                        answer += str(vote[1][1])
                    else:
                        answer += str(vote[0][1])
                writer.writerow((filename, answer))
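The nearest-neighbour picks implement a k=3 majority vote with a fallback to the single nearest neighbour. A compact equivalent using the standard library, reusing calDistance and the training list all from the example:

import heapq
from collections import Counter

def classify(i, samples, k=3):
    # k nearest training samples by distance, without mutating any list
    nearest = heapq.nsmallest(k, samples, key=lambda s: calDistance(s[0], i))
    labels = [s[1] for s in nearest]
    best, count = Counter(labels).most_common(1)[0]
    # fall back to the single nearest neighbour when all labels differ
    return str(best) if count > 1 else str(labels[0])

Inside the loop above this would reduce to answer += classify(i, all).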
Example #5
    def build(self):
        """ 
            Reads files one-by-one and builds the index

            Arguments:
            None

            Returns:
            None
        """

        if os.path.isfile("InvertedIndex"):
            print("Loading the Inverted Index from file")
            with open("InvertedIndex", "rb") as file:
                self._btree = pickle.load(file)
        else:
            print("Building the Inverted Index")
            dirPath = os.path.dirname(os.path.realpath(__file__))
            dataPath = os.path.realpath(os.path.join(dirPath, "..", "data"))
            files = [
                os.path.join(dataPath, file)
                for file in sorted(os.listdir(dataPath))
            ]

            for file in files:
                # Data files are named "<docId>.csv"
                filename = int(os.path.split(file)[1].split(".csv")[0])
                snippets = getSnippets(file)
                for index, snippet in enumerate(snippets):
                    # index + 2 maps a snippet back to its spreadsheet
                    # row (row 1 is the header)
                    docId = (filename, index + 2)
                    tokens, snippetMetadata = preProcess(snippet)
                    self.updateIndex(tokens, docId)

            self.sortPostingLists()

            sys.setrecursionlimit(10000)  # pickling the deep tree recurses
            with open("InvertedIndex", "wb") as file:
                print("Saving the Inverted Index to file")
                pickle.dump(self._btree, file)
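build() relies on a getSnippets helper that yields one text snippet per CSV data row; docId uses index + 2 because row 1 is the header. A plausible sketch, with the column name assumed:

import csv

def getSnippets(path, textColumn="snippet"):
    # One snippet per data row; the first data row is spreadsheet row 2
    with open(path, newline="", encoding="utf-8") as f:
        return [row[textColumn] for row in csv.DictReader(f)]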
Example #6
# Assumed context: cv2 is imported, and ds, pp, vb are project-local
# modules (data storage, pre-processing and video-buffer helpers).
videoFile = '../assets/sample2.mp4'

frameRate = 30

Cats = []
Success = False
for catIndex in range(5):
    filename = ds.genFileName(videoFile, catIndex)
    print('Loading ' + filename)
    meta, cat = ds.load(filename)
    if len(cat) > 0:
        Cats.append(cat)
        Success = True

if not Success:
    Cats = pp.preProcess(videoFile, 5, [5, 4, 3, 2, 1], [0, 0, 0, 0, 0],
                         [1, 2, 3, 4, 5], 2, 1)
    print('Saving data...')
    ds.dump(videoFile, Cats)

buff = vb.Buff(10000)

for cat in Cats:
    for msg in cat:
        buff.write(msg)

buff.finished = True

cv2.namedWindow("Nextflix")

#------------------------------------------------------------------------------------
frameList = []
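The excerpt exercises only a small part of the Buff interface: a bounded constructor, write(msg) and a finished flag (Example #12 below also passes a channel count). A minimal stand-in, assumed rather than taken from the project's vb module:

class Buff:
    # Bounded FIFO of video messages; 'finished' tells the display side
    # that no further writes will arrive.
    def __init__(self, capacity, nChannels=1):
        self.capacity = capacity
        self.nChannels = nChannels
        self.finished = False
        self._items = []

    def write(self, msg):
        if len(self._items) < self.capacity:
            self._items.append(msg)

    def read(self):
        return self._items.pop(0) if self._items else None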
Example #7
def rankDocs(query, index_collection=index_collection):
    # Pre-process the query the same way documents were during indexing
    query = preProcess(query)
    if not query:
        return None
    mongo_query = [{"word": token} for token in query]
    docs_cursor = index_collection.find({"$or": mongo_query})
    # Map each query keyword to its posting list and idf from the db
    keywordstodoc = {}
    for doc in docs_cursor:
        keywordstodoc[doc["word"]] = (doc["docList"], doc["idf"])
    # One [position, exhausted] pointer per query term
    docptrs = [[0, False] for i in range(len(query))]
    isEmpty = 0
    for i in range(len(query)):
        if query[i] not in keywordstodoc:
            docptrs[i] = [0, True]
            keywordstodoc[query[i]] = None
            isEmpty += 1
    if isEmpty == len(query):
        print("Could not find any documents for your query; "
              "please check the spelling or rephrase your search")
        return None
    minHeap = MinHeap(len(query) + 1, param=1, type='ObjectId')
    isTraversed = False

    # Seed the heap with the first document of every matched keyword
    for i in range(len(query)):
        if not docptrs[i][1]:
            doctoinsert = keywordstodoc[query[i]][0][0]
            minHeap.insert((i, doctoinsert["docid"]))
    minHeap.minHeap()
    minDoc = minHeap.min()  # smallest document in the heap
    # scoreDoc collects, for the current smallest document, pairs of
    # (query term number, position of the document in that term's doclist)
    scoreDoc = [[minDoc[0], docptrs[minDoc[0]][0]]]
    scores = []
    # Maps each substring present in every document to the maximum number
    # of times it occurs over all documents; used when scoring
    max_substrings = {}
    while not isTraversed:
        # Advance the pointer of the term that produced the smallest doc
        query_number = minDoc[0]
        smallest_query = query[query_number]
        docptrs[query_number][0] += 1
        docptr = docptrs[query_number][0]
        if docptr == len(keywordstodoc[query[query_number]][0]):
            # This term's doclist is exhausted: flag it and pop the root
            docptrs[query_number][1] = True
            minHeap.remove()
        else:
            # Replace the root with the term's next document and heapify
            minHeap.removeandreplace(
                (query_number,
                 keywordstodoc[smallest_query][0][docptr]["docid"]))
        newminDoc = minHeap.min()  # new smallest document after heapifying

        if minDoc[1] == newminDoc[1]:
            # Same smallest document: record that this term also matches it
            scoreDoc.append([newminDoc[0], docptrs[newminDoc[0]][0]])
            minDoc = newminDoc
        else:
            # The smallest document changed: score the finished one
            score = getScore(scoreDoc, query, keywordstodoc)
            scores.append(score)
            minDoc = newminDoc
            scoreDoc = [[newminDoc[0], docptrs[minDoc[0]][0]]]
        # Stop once every doclist has been fully traversed
        query_counter = 0
        for ptr in docptrs:
            if ptr[1]:
                query_counter += 1
        if query_counter == len(query):
            isTraversed = True
    return sort_tuple(scores)
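The loop above is a k-way merge of per-term posting lists driven by a min-heap: each term contributes a sorted doclist, and documents reached from more than one list are the multi-term matches. The same idea with the standard library, on invented toy data:

import heapq
import itertools

postings = {"cat": [1, 4, 7], "dog": [4, 7, 9]}  # term -> sorted doc ids
for docid, grp in itertools.groupby(heapq.merge(*postings.values())):
    print(docid, "matched by", sum(1 for _ in grp), "term(s)")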
Example #8

    def predict(self, x):  # def line reconstructed; missing from the excerpt
        available_features = list(x)
        if self.tree_built:
            pres_node = self.root
            while pres_node.definiteLabel is None:
                attribute = pres_node.attribute
                if pres_node.children[available_features[attribute] - 1]:
                    pres_node = pres_node.children[
                        available_features[attribute] - 1]
                    available_features.pop(attribute)
                else:
                    return pres_node.labelSamples
        return pres_node.definiteLabel


if __name__ == '__main__':
    preProcessData = preProcess()
    preProcessData.handle_missing_values()
    preProcessData.handle_highly_correlated_features()
    df = preProcessData.return_df()
    DT = decisionTree()
    # Keyword arguments are required by current scikit-learn versions
    kFold = KFold(n_splits=6, shuffle=True, random_state=1)
    values = df.values
    for train, test in kFold.split(values):
        print("Taking %d train datapoints" % len(train))
        train_x = values[train]
        test_x, test_y = values[test][:, :-1], values[test][:, -1]
        DT.fit(train_x)
        pred_y = DT.predict(test_x)
        results = testResults(pred_y, test_y)
        print("Accuracy of model is ", results.return_accuracy())
        print('F Score of model is ', results.return_fscore())
Example #9
processes = []  #Array of current processes
subprocesses = []  #Array of subprocesses
#--------------------------------------------------------------------

################## Timer ##########################

if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################

if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating,
                   config.TEST_SUBSET, config.PROCESS_TAGS,
                   config.PROCESS_SOCIAL, config.PROCESS_HISTORY, processes,
                   mproc)
    print("Pre-Processing Complete")

################### Setup Models ###################

if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################# Run Models ###################

if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths,
                        CVPredictionPaths, config.TRIALS, RMSEPaths, False)

    #### Fix #####
Example #10
processes = []  #Array of current processes
subprocesses = []  #Array of subprocesses
#--------------------------------------------------------------------

################## Timer ##########################

if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################

if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating,
                   config.LAPTOP_TEST, config.PROCESS_MOVIE_TAG,
                   config.PROCESS_SOCIAL, config.PROCESS_HISTORY)
    print("Pre-Processing Complete")

################### Setup Models ###################

if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################# Run Models ###################

if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths,
                        CVPredictionPaths, config.TRIALS, RMSEPaths, False)
Example #11
from numpy import dot, linalg
from scipy.sparse import bsr_matrix
from preProcess import preProcess
from loadSparseData import loadSparseData
from PA import PA

fname1 = 'pu1'
fname2 = 'pu2'
fname3 = 'pu3'
fname4 = 'pua'
X1, Y1 = loadSparseData(fname1)
X2, Y2 = loadSparseData(fname2)
X3, Y3 = loadSparseData(fname3)
X4, Y4 = loadSparseData(fname4)

print('preProcess X')
X1 = preProcess(X1)
X2 = preProcess(X2)
X3 = preProcess(X3)
X4 = preProcess(X4)

print('start COML')

# C, fai1 and fai2 are hyper-parameters assumed to be defined elsewhere in
# the script (the aggressiveness bound and the two interpolation weights).
def update_CM(i, X, Y, w, u):
    x_t = X[i]
    f_t = dot(w, x_t)
    loss_w = max(0, 1 - Y[i] * f_t)  # hinge loss of the current model w
    if loss_w == 0:
        tau = 0
    else:
        tau = min(C, (fai1 + fai2 -
                      dot(Y[i] * (fai1 * w + fai2 * u), x_t) /
                      linalg.norm(x_t)))
    # Passive-aggressive style step, interpolating w with the reference u
    w = fai1 * w + fai2 * u + tau * Y[i] * x_t
    return w
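A toy driver for update_CM, invented for illustration with placeholder hyper-parameter values:

import numpy as np

C, fai1, fai2 = 1.0, 0.5, 0.5          # placeholder hyper-parameters
Xtoy = np.array([[1.0, 0.0, 2.0],
                 [0.0, 1.0, 1.0]])     # two 3-dimensional samples
Ytoy = np.array([1, -1])               # +/-1 labels
w = np.zeros(3)                        # model weights
u = np.zeros(3)                        # reference vector
for i in range(len(Ytoy)):
    w = update_CM(i, Xtoy, Ytoy, w, u)
print(w)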
Example #12
File: client.py  Project: epmcj/nextflix
def main():

    testMode = True  #set this value to False if you want to recover data from disk

    videoFile = '../assets/chaplinCut.mp4'
    grayScale = True

    frameRate = 25
    # Number of frames to wait each time the video stalls;
    # increase this value for more fluid playback.
    framesBeforeStart = 30
    # Maximum number of objects that can wait for new data;
    # increase this value for better quality.
    receiveWindow = 30

    #reading the pre-processed data
    Cats = []
    Success = False

    if not testMode:
        for catIndex in range(2):
            filename = ds.genFileName(videoFile, catIndex)
            meta, cat = ds.load(filename)
            if len(cat) > 0:
                Cats.append(cat)
                Success = True

    # if the pre-processed data does not exist, create it
    if not Success:
        nCats = 2
        msgRedundancies = [0, 0]
        exp = 2
        fixed = 1
        msgPeriods = [180, 90]
        msgSize = [1, 2]
        Cats = pp.preProcess(videoFile, nCats, msgPeriods, msgRedundancies,
                             msgSize, exp, fixed, grayScale)
        #Cats = pp.preProcess(videoFile,2,[600,480,360,240,120],[0,0,0,0,0],\
        #[1,1.2,1.4,1.6,1.8],2,1,grayScale)
        # Cats = pp.preProcess(videoFile,5,[5,4,3,2,1],[0,0,0,0,0],[1,2,3,4,5],2,1)
        if not testMode:
            print('Saving data...')
            ds.dump(videoFile, Cats)

        #the video buffer
        buff = vb.Buff(10000, len(Cats[0][0].channel))
    else:
        #the video buffer
        buff = vb.Buff(10000, meta.nChannels)

    #the messages arrive
    for cat in Cats:
        for msg in cat:
            buff.write(msg)

    buff.finished = True

    vc.startDisplayMechanism(framesBeforeStart, receiveWindow, buff, frameRate)

    while buff.getCode_displayer() != -1:
        pass

    print('Exiting application')
Example #13
	def selection(self, thresh=2, lenThreshRatio=1.0/10):
		verLines = self.verLines
		horLines = self.horLines

if __name__ == '__main__':
	
	for index in range(2,6):
		i = str(index)
		filePath = "images/tree"+ i +"_wo.jpg"
		image = cv.imread(filePath,0)
		plt.imshow(image, cmap='Greys_r')
		plt.show()
		p = preProcess(image)
		p.bolding()
		p.displayImage()
		oriImage = p.returnImage()
		a = scaleCornerDetection(oriImage)
		a.parsing()

	# for index in range(1,5):
	# 	filePath = "images/tree%d_wo.jpg" %index
	# 	print filePath
	# 	image = cv.imread(filePath,0)
	# 	a = cornerDetection(image)
	# 	cv.imwrite('images/tree%d_corner.jpg', a, [cv.IMWRITE_JPEG_QUALITY, 100])

	# image = cv.imread("images/tree3.jpg", 0)
Example #14

                        #where CV predictions are saved
processes = []  #Array of current processes
subprocesses = []  #Array of subprocesses
#--------------------------------------------------------------------

################## Timer ##########################

if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################

if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating)
    print("Pre-Processing Complete")

################### Setup Models ###################

if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################# Run Models ###################

if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths,
                        CVPredictionPaths, config.TRIALS, RMSEPaths, False)

    #### Fix #####
Example #15
processes = []  #Array of current processes
subprocesses = []  #Array of subprocesses
#--------------------------------------------------------------------

################## Timer ##########################

if config.TIME_RUN:
    print("Start Timing")
    start_time = time.time()

################### Pre-Process ###################

if config.PRE_PROCESS:
    print("Pre-Processing")
    pre.preProcess(os, utils, random, config.DE_EFFECT, userMovieRating,
                   config.TEST_SUBSET, config.PROCESS_TAGS,
                   config.PROCESS_SOCIAL,
                   config.PROCESS_HISTORY, processes, mproc)
    print("Pre-Processing Complete")

################### Setup Models ###################

if config.SETUP_MODELS:
    setupModels.setupModels(sys, os, utils, config, random, mproc, modelList)

################# Run Models ###################

if config.RUN_MODELS:
    runModels.runModels(sproc, modelList, testPredictionPaths,
                        CVPredictionPaths, config.TRIALS, RMSEPaths, False)
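Examples #9, #10, #14 and #15 are variants of one pipeline and all branch on boolean flags read from a config module. A minimal stand-in with the flags they check (names from the code, values illustrative):

# config.py -- illustrative stand-in; the real values live in the project
TIME_RUN = True
PRE_PROCESS = True
SETUP_MODELS = True
RUN_MODELS = True
TRIALS = 1
DE_EFFECT = False
TEST_SUBSET = True
LAPTOP_TEST = False
PROCESS_TAGS = PROCESS_MOVIE_TAG = False
PROCESS_SOCIAL = PROCESS_HISTORY = False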
Example #16
    '''
    df = formatPreds(dataDict, preds)
    errors = evaluate(df, ensemble=False)
    mape = errors["NN"]["mape"]
    rmse = errors["NN"]["rmse"]
    print("{:<10.1%}{:.2f}".format(mape, rmse))
    return (mape, rmse)


if __name__ == "__main__":
    args = getArgs()
    config = getConfig()

    df = getData(config)
    df = df.sample(frac=0.3)
    dataDict = preProcess(df, config, args)

    parms = getParms("NN")  # The hyper-parameter combinations to be tested

    results = []
    count = 1

    start_time = time.time()
    print("\n{} parameter combinations".format(len(parms)))
    print("\n{:<10}{}".format("MAPE", "RMSE"))

    for x in parms:
        parmDict = loadParms(x)

        preds = runNN(dataDict, parmDict, config)
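getParms("NN") supplies the hyper-parameter combinations being swept, and loadParms(x) turns one combination into a dict. One plausible shape for getParms, with the grid and key names invented for illustration:

import itertools

def getParms(model):
    # Cross-product of a small hyper-parameter grid; keys are illustrative
    grid = {"lr": [1e-3, 1e-4], "units": [32, 64], "dropout": [0.0, 0.5]}
    return list(itertools.product(*grid.values()))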
Example #17
    def make_index(self):
        print("Building Index....")
        for doc in self.docs:
            self.docCount += 1
            docId = doc['_id']
            print("Processing URL for doc {}".format(docId))
            url = preProcessUrl(doc['url'])
            print("Processing transcript for doc {}".format(docId))
            transcript = preProcess(doc['transcript'])
            for i in range(len(transcript)):
                word = transcript[i]
                if word != '':
                    if word not in self.index:
                        # Word has not been seen in any document yet
                        self.index[word] = {
                            "word": word,
                            "idf": 1,
                            "docList": [{'docid': docId, 'body': [i],
                                         'tf': 1, 'url': []}]
                        }
                    elif docId != self.index[word]["docList"][-1]['docid']:
                        # Word has not been seen in the current document
                        self.index[word]["docList"].append(
                            {'docid': docId, 'body': [i], 'tf': 1, 'url': []})
                        self.index[word]["idf"] += 1
                    else:
                        # Word has been seen before in this document:
                        # record its position and bump the term frequency
                        self.index[word]["docList"][-1]['body'].append(i)
                        self.index[word]["docList"][-1]['tf'] += 1
            for i in range(len(url)):
                word = url[i]
                if word != '':
                    if word not in self.index:
                        # Word has not been seen in any document yet
                        self.index[word] = {
                            "word": word,
                            "idf": 1,
                            "docList": [{'docid': docId, 'body': [],
                                         'tf': 1, 'url': [i]}]
                        }
                    elif docId != self.index[word]["docList"][-1]['docid']:
                        # Word has not been seen in the current document
                        self.index[word]["docList"].append(
                            {'docid': docId, 'body': [], 'tf': 1, 'url': [i]})
                        self.index[word]["idf"] += 1
                    else:
                        # Word has been seen before in this document
                        self.index[word]["docList"][-1]['url'].append(i)
                        self.index[word]["docList"][-1]['tf'] += 1
        wordCount = 0
        for word in self.index.keys():
            wordCount += 1
            # Replace the raw document frequency stored in "idf" with a
            # smoothed inverse document frequency
            self.index[word]["idf"] = math.log(self.docCount /
                                               (self.index[word]["idf"] + 1))
        print("Index with {} words successfully made".format(wordCount))