Example #1
def test():
    start = ''
    print 'press q to exit'
    args = parse_args()
    clothes = feature.Clothes(init_pca=1)
    index = cPickle.loads(open(args.index).read())
    searcher = Searcher(index)
    while True:
        start = raw_input('please input image address: ')       
        if start == 'q':
            break
        start = 'query-images/' + start
        try:
            queryImage = cv2.imread(start)
            queryFeatures = descriptor.get_descriptor(start, multi_box=False)
            results = searcher.search(queryFeatures)
            result = []
            result.append(queryImage)
            for j in xrange(0, 15):
                # grab the result (we are using row-major order) and
                # load the result image
                imageName = results[j]
                #path = args.dataset + "/%s" % (imageName)
                result.append(cv2.imread(imageName))
            plot(result)
        
            plt.show()
        except:
            print "error"
Example #2
def testSave():
    args = parse_args()
    searcher = Searcher('data/tree','data/index')
    imagesDir = args.query
    ext = ['jpg','png','jpeg','JPG','PNG','JPEG']
    images = os.listdir(imagesDir)
    flag = 1
    import time
    start = time.time()
    for image in images:
        imType = image.split('.')[-1]
        if imType in ext:
            image_path = os.path.join(imagesDir,image)
            try:
                queryImage = cv2.imread(image_path)
                queryFeatures = descriptor.get_descriptor(image_path, multi_box=False)
                results = searcher.search(queryFeatures)
                result = []
                result.append(queryImage)
                for j in xrange(0, 14):
                    # grab the result (we are using row-major order) and
                    # load the result image
                    imageName = results[j]
                    #path = args.dataset + "/%s" % (imageName)
                    #print os.path.join(args.dataset,imageName)
                    result.append(cv2.imread(os.path.join(args.dataset,imageName)))
                    #print "\t%d. %s : %.3f" % (j + 1, imageName, score)
                plot(result,flag,args.save)
                flag += 1
            except:
                print 'error with',image_path
    print 'Total time: ',time.time() - start                 
Example #3
def search():
    """
    When a user enters a search query, it is obtained here. It calculates the
    top documents by using the Searcher class.
    :return: A template which is populated by input_query, top 20 results,
    cosine scores of various query_words, and words whose document frequency
    is zero.
    """
    query = request.form.get('searchBar')
    query = unicodedata.normalize('NFKD', query).encode('ascii', 'ignore')
    now = time.clock()
    searcher = Searcher(query)
    results = searcher.cosine_score()
    scores = searcher.query_score
    print time.clock() - now
    zero_scores = searcher.top_corrections
    boolean_results = searcher.boolean_results
    if len(boolean_results) == 0:
        boolean_error = True
    else:
        boolean_error = False
    title_results = searcher.title_results
    if len(title_results) > 10:
        title_results = []
    return render_template("displayResults.html",
                           input_query=query,
                           results=results,
                           scores=scores,
                           zero_scores=zero_scores,
                           title_results=title_results,
                           error=boolean_error,
                           boolean_results=boolean_results)
Example #4
def test():
    start=''
    print 'press q to exit'
    args = parse_args()
    clothes = feature.Clothes(init_pca=1)  # assumed: `clothes` is used below; Example #1 constructs it this way
    index = cPickle.loads(open(args.index).read())
    searcher = Searcher(index)
    while True:
        start = raw_input('please input image address: ')       
        if start == 'q':
            break
        #start = 'query-images/' + start
        try:
            queryImage = cv2.imread(start)
            queryFeatures = clothes.run(start)
            results = searcher.search(queryFeatures)
            result = []
            result.append(queryImage)
            for j in xrange(0, 15):
                # grab the result (we are using row-major order) and
                # load the result image
                (score, imageName) = results[j]
                #path = args.dataset + "/%s" % (imageName)
                result.append(cv2.imread(imageName))
                print "\t%d. %s : %.3f" % (j + 1, imageName, score)
            plot(result)
        
            plt.show()
        except:
            print "error"
Example #5
def main():
    # get options from console.
    options = args()

    # get configuration from file.
    config = get_conf(options['config_file'])

    # create ES connection to hosts.
    connections.create_connection(hosts=config['elasticsearch']['hosts'],
                                  timeout=30)

    # create the searcher instance to find alarms, given the options from
    # console.
    searcher = Searcher(options['from'],
                        options['query'],
                        ttime=options['to'],
                        per_page=500,
                        min_priority=options['min_priority'])

    buckets = [
        PathClassBucket(
            utils.build_url(config['kibana']['host'],
                            config['kibana']['secure']))
    ]

    # manually fetch all alarms from the searcher and pass it to every bucket.
    for alarm in searcher.pages():
        for bucket in buckets:
            bucket.cherry_pick(alarm)

    # dump all buckets, this will print out all buckets.
    for bucket in buckets:
        bucket.dump()
Example #6
def search(request):
	"""
	The main method to return search result
	Param request: the request obj
	"""
	result_template = get_template('result.html')
        st = time.time()
	query = request.GET['q']
        query = query.encode('utf-8')
        searcher = Searcher()
        result = searcher.search_result(query)
        paginator = Paginator(result,10)
        try:
            page = int(request.GET.get('page','1'))
        except ValueError:
            page = 1
        try:
            page_result = paginator.page(page)
        except(EmptyPage,InvalidPage):
            page_result = paginator.page(paginator.num_pages)
        #file_result = searcher.search_file(query)
        file_result = []
        search_time = time.time()-st
        search_time = "%.3f" % search_time
        html = result_template.render(Context({'query':query,'result':page_result,'file_result':file_result,"search_time":search_time}))
	return HttpResponse(html)
Example #7
def search():
    query = request.form['query']
    field = request.form['field']
    searcher = Searcher()
    result = searcher.search(query, field)
    return render_template("results.html",
                           query=query,
                           videos=result["videos"])
Example #8
class TestSearch(unittest.TestCase):

    doc1 = {'path': '/foo/doc1',
            'keywords': 'document,one,foo',
            'title': 'Document one title.',
            'text': 'This is the a test document. Wot!'}

    doc2 = {'path': '/foo/doc2',
            'keywords': 'document,two,bar',
            'title': 'Document two title.',
            'text': 'Testing is cool, yo.'}

    def setUp(self):
        self.searcher = Searcher(tempfile.mkdtemp())
        self.searcher.add_documents(json.dumps((self.doc1, self.doc2)))

    def test_search_by_text(self):
        results = self.searcher.search('Wot')
        self.assertEqual(1, len(results))
        for fieldname, fieldvalue in results[0].items():
            self.assertEqual(self.doc1.get(fieldname), fieldvalue)

        # Now stemming. Document 2 only has "testing" in its body but it'll be
        # matched anyway because the "text" field of the schema uses the
        # StemmingAnalyzer.
        results = self.searcher.search('test')
        self.assertEqual(2, len(results))
        matched_doc_paths = [hit['path'] for hit in results]
        self.assertIn('/foo/doc1', matched_doc_paths)
        self.assertIn('/foo/doc2', matched_doc_paths)

    def test_search_by_keyword(self):
        # Simple search by a single keyword.
        results = self.searcher.search('keywords:one')
        self.assertEqual(1, len(results))  # doc1
        self.assertEqual(results[0]['path'], '/foo/doc1')

        # By two keywords.
        results = self.searcher.search('keywords:one,foo')
        self.assertEqual(1, len(results))  # doc1
        self.assertEqual(results[0]['path'], '/foo/doc1')

        # Keyword intersection.
        results = self.searcher.search('keywords:one,two')
        self.assertEqual(0, len(results))  # There are none.

        # Search for docs with a "one" keyword and "yo" in the body.
        results = self.searcher.search('keywords:one yo')
        self.assertEqual(0, len(results))  # There are none.

        # Anything with the "document" keyword.
        results = self.searcher.search('keywords:document')
        self.assertEqual(2, len(results))  # Both docs.

    def test_search_by_title(self):
        results = self.searcher.search('title:"Document one"')
        self.assertEqual(1, len(results))  # doc1
        self.assertEqual(results[0]['path'], '/foo/doc1')
Example #9
    def colorSearch(self):
        '''
        Searches query image against index and returns the specified number of matches.
        Results are in the format (chi-squared distance, image name).
        '''
        searcher = Searcher(self.colorIndex)
        queryFeatures = self.createHistogram(self.image)

        results = searcher.search(queryFeatures)
        return results
Example #11
    def aftermath(self, results):
        try:
            keywords = self.analyse_keywords(results)
            sentiment = self.analyse_sentiment(results)
        except Exception as e:
            print(e)
            return  # keywords/sentiment would be undefined below if analysis failed
        searcher = Searcher()
        for keyword in keywords:
            imageResults = searcher.searchImages(keyword)
            imageResults = searcher.validateLinks(imageResults)
            self.completionFunction(self.name, keyword, sentiment,
                                    imageResults)
Example #12
class ImageSearcher():
    '''Image searcher API for clothes retrieval web demo'''
    def __init__(self):
        root_path = os.path.dirname(__file__)
        tree_path = os.path.abspath(os.path.join(root_path, 'db/tree'))
        inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
        feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
        self.searcher = Searcher(tree_path, inds_path, feature_path)
        

    def search(self, image_path, do_detection=0, k=20): 
        t1 = Timer()
        t1.tic()
        #queryFeatures = descriptor.get_descriptor(image_path, multi_box=False)
        queryFeatures = descriptor.get_descriptor(image_path,do_detection=do_detection)
        
        t1.toc('Feature Extraction time: ')
        t2 = Timer()
        t2.tic()
        results, dists, ind = self.searcher.search(queryFeatures,k=k)
        #self.queryExpansion(results, dists, ind)
        #self.queryExpansion(results, dists, ind)
        t2.toc('Knn search time: ')
        result = []
        dist = []
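        # keep only the first occurrence of each image, preserving rank order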
        for j,imageName in enumerate(results):
            if imageName not in result:
                result.append(imageName)
                dist.append(dists[j])
        return result[:k],dist[:k]
        
    def queryExpansion(self, results, dists, ind, threshold=0.8, k=10, top=4):
        """
        Do Query Expansion with at most top 5
        """
        features = self.searcher.features
        feature = []
        for i,dist in enumerate(dists):
            if dist < threshold and i < top:
                feature.append(features[ind[i]])
        if len(feature) == 0:
            return 0
        query = np.mean(np.array(feature), axis=0)
        new_results, new_dists, new_ind = self.searcher.search(query,k=k)
        for i,dist in enumerate(new_dists):
            if dist > dists[-1]:
                break
            for j,d in enumerate(dists):
                if dist < d:
                    results.insert(j, new_results[i])
                    dists.insert(j, dist)
                    break 
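A brief usage sketch for the ImageSearcher class above (the query image path is illustrative):

searcher = ImageSearcher()
names, dists = searcher.search('query-images/dress.jpg', do_detection=1, k=20)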
Example #13
def mainapp():
    result = ""
    input = request.args.get('searchstr')
    sorttype = request.args.get('sortselect')
    results = []
    if ((input is not None and sorttype is not None) and input != ""):
        searcher = Searcher(input, sorttype)
        results = searcher.search()
    if input is None:
        input = ""
    widgets = [r.widget for r in results]
    #return render_template('index.html', input=input,result="".join(widgets))
    return render_template('index2.html', input=input, results=results)
Example #14
def mainapp():
	result=""
	input = request.args.get('searchstr')
	sorttype = request.args.get('sortselect')
	results = []
	if((input is not None and sorttype is not None) and input != ""):
		searcher = Searcher(input,sorttype)
		results = searcher.search()
	if input is None:
		input = ""
	widgets = [r.widget for r in results]
	#return render_template('index.html', input=input,result="".join(widgets))
	return render_template('index2.html', input=input,results=results)
Example #15
    def visualize_specific_ranker(self, query):
        viewer = GraphViewer()
        specific_documents = Searcher(query).get_topic_documents()
        m = self.basic_matrix
        specific_doc_ids = list()
        with closing(shelve.open("ids.db")) as db:
            for doc in specific_documents:
                specific_doc_ids.append(db[doc[16:]])
        specific_vector = np.zeros(m.shape[0])
        for doc_id in specific_doc_ids:
            specific_vector[doc_id] = (1 - self.taxation_factor) / \
                                      len(specific_doc_ids)

        rank_vector = np.full(m.shape[0], 1.0 / m.shape[0])  # float division (under Python 2, 1 / m.shape[0] would be 0)
        print len(specific_doc_ids)
        viewer.view_graph(node_list=list(specific_doc_ids))
        count = 0
        while True:
            count += 1
            rank_vector1 = m * rank_vector + specific_vector
            diff = rank_vector1 - rank_vector
            diff = sum(diff * diff)
            if diff < 1e-50:
                break
            else:
                rank_vector = rank_vector1
                if count % 25 == 0:
                    try:
                        viewer.view_graph(node_list=list(specific_doc_ids),
                                          ranks=list(rank_vector),
                                          mult_factor=40000,
                                          concat=150)
                    except networkx.exception.NetworkXError as e:
                        print e
Example #16
class Index(object):

    def __init__(self, cache_host=None, cache_port=None, db_file_path=None,
                 db_url=None, load_from_db=None):
        cache = create_index_cache(host=cache_host, port=cache_port)
        db = create_index_store(file_path=db_file_path, url=db_url)

        self.reader = IndexReader(db, cache)
        self.writer = IndexWriter(db, cache)
        self.searcher = Searcher(self.reader)

        if load_from_db:
            self.load_from_db()

    def search(self, query):
        return self.searcher.search(query)

    def commit(self):
        self.writer.db.commit()
        self.load_from_db()

    def load_from_db(self):
        # Refresh data in reader with data from database
        self.reader.load_from_db()

        # Push the new data into the cache
        self.writer.build_cache(self.reader.doc_word_scores)

    def writer(self):
        return self.writer

    def reader(self):
        return self.reader
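A minimal usage sketch for the Index class above, assuming the create_index_cache and create_index_store defaults are usable (the database file name is illustrative):

index = Index(db_file_path='index.db', load_from_db=True)
hits = index.search('example query')
index.commit()  # persist pending writes, then reload the reader and cache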
Example #17
def testSave():
    args = parse_args()
    #searcher = ImageSearcher()
    searcher = Searcher(args.index, args.feature)
    imagesDir = args.query
    ext = ['jpg','png','jpeg','JPG','PNG','JPEG']
    images = os.listdir(imagesDir)
    os.system('rm %s'%(args.save+'/*'))
    f = open(os.path.join(args.save,'info.txt'),'w')
    flag = 1
    import time
    start = time.time()
    for image in images:
        imType = image.split('.')[-1]
        if imType in ext:
            image_path = os.path.join(imagesDir,image)
            try:
                queryImage = cv2.imread(image_path)
                queryFeatures = descriptor.get_descriptor(image_path)
                results,dists,ind = searcher.search(queryFeatures,k=15)
                #results,dists = searcher.search(image_path, do_detection=1, k=15)
                result = []
                result.append(queryImage)
                dist = []
                dist.append(1)
                print '~~~~~~~~~~~~~~~',flag,'~~~~~~~~~~~~~~~~~~~'
                f.write(str(flag)+'.~~~~~~~~~~~~~~~~'+'\n')
                f.write(image_path+'\n')
                for j in xrange(0, 15):
                    # grab the result (we are using row-major order) and
                    # load the result image
                    score,imageName = dists[j], results[j]
                    #path = args.dataset + "/%s" % (imageName)
                    #print os.path.join('images-to-index',imageName)
                    #result.append(cv2.imread(os.path.join(args.dataset,imageName)))
                    result.append(cv2.imread(imageName))
                    f.write(imageName+'\n')
                    dist.append(score)
                    #print "\t%d. %s : %.3f" % (j + 1, imageName, score)
                    #print score
                plot(result,dist,flag,args.save)
                flag += 1
            except:
                print 'error with',image_path
    f.close()
    print 'Total time: ', time.time() - start                 
Example #18
def search():
    query = request.args.get("q")
    if query is None:
        return render_template("index.html")

    try:
        page = int(request.args.get("p", 1))
    except (TypeError, ValueError):
        page = 1

    searcher = Searcher()
    results = searcher.search_page(query, page)
    paginator = Paginator(results)
    return render_template("index.html",
                           results=results,
                           paginator=paginator,
                           q=query)
Example #19
    def __init__(self):
        root_path = os.path.dirname(__file__)
        tree_path = os.path.abspath(os.path.join(root_path, 'db/tree5'))
        inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
        feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
        self.searcher = Searcher(tree_path, inds_path, feature_path)
        #self.dataset = os.path.abspath(os.path.join(root_path, '../CBIR/datasets'))
        label_path = os.path.abspath(os.path.join(root_path, 'db/label.pkl'))
        self.label = cPickle.loads(open(label_path).read())
Example #20
class qrels:
    """
    A class to create the qrels of the questions for SQUAD dataset.
    """
    def __init__(self, index_dir):
        self.searchObject = Searcher(index_dir)

    def get_id_section(self, pair):
        """
        Returns the id of the passage section that is most similar to the context.
        :param pair: a (title, context) pair that represents the query.
        :return: the id and content of the matching section.
        """
        result = self.searchObject.pairSearch(pair, BM25Similarity())
        id, content = None, None
        for i in range(len(result)):
            hitDoc = self.searchObject.searcher.doc(result[i].doc)
            id = hitDoc.get("id_section")
            content = hitDoc.get("content_section")
            if id != "":
                break
        return id, content

    def process(self, input_file, output_dir):
        """
        This is the main function that creates the qrels file.
        :param input_file: the file to process.
        :param output_dir: the folder where to store the qrels file.
        :return:
        """
        num_lines = 0
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
            for p in data['data']:
                for par in p['paragraphs']:
                    num_lines += 1

        output_file = open(output_dir + "/qrels.txt", 'a+', encoding="utf-8")
        with tqdm(total=num_lines) as pbar:
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
                for p in data['data']:
                    for par in p['paragraphs']:
                        pbar.update(1)
                        psg_id, content = self.get_id_section(
                            (p['title'], par["context"]))
                        # print("Content: "+content+"\n")
                        #similarity = round(len(set(par["context"]) & set(content)) / len(set(par["context"])), 2)
                        for q in par["qas"]:
                            qst_id = q["id"]
                            if q["is_impossible"] is False:
                                output_file.write(qst_id + " 0 " + psg_id +
                                                  " 1 \n")
                print("==> Qrels successfully created.\n")
Example #21
    def colorSearch(self, max_matches=5):
        '''
        Searches query image against index and returns the specified number of matches.
        Results are in the format (chi-squared distance, image name).
        '''
        self.index = self.createIndex()

        image = cv2.imread(self.image)
        print("Querying: " + self.image + " ...")
        searcher = Searcher(self.index)
        queryFeatures = self.createHistogram(image)

        results = searcher.search(queryFeatures)[:max_matches]

        print("Matches found:")
        for j in range(len(results)):
            (score, imageName) = results[j]
            print("\t%d. %s : %.3f" % (j+1, imageName, score))

        return results
Example #22
    def searchByColor(self):
        '''
        Searches query image against index and returns the specified number of matches.
        '''

        MAX_NUMBER_MATCHES = 5

        image = cv2.imread(self.image)
        print("Querying: " + self.image + " ...")
        searcher = Searcher(self.index)
        queryFeatures = self.createHistogram(image)

        results = searcher.search(queryFeatures)[:MAX_NUMBER_MATCHES]

        print("Matches found:")
        for j in range(len(results)):
            (score, imageName) = results[j]
            print("\t%d. %s : %.3f" % (j+1, imageName, score))

        return results
Example #23
    def __init__(self, cache_host=None, cache_port=None, db_file_path=None,
                 db_url=None, load_from_db=None):
        cache = create_index_cache(host=cache_host, port=cache_port)
        db = create_index_store(file_path=db_file_path, url=db_url)

        self.reader = IndexReader(db, cache)
        self.writer = IndexWriter(db, cache)
        self.searcher = Searcher(self.reader)

        if load_from_db:
            self.load_from_db()
Example #24
def weighted_search():
    """
    When a user enters weights to use for different query_words, they are
    obtained by this method.
    :return: A template populated by input_query, the results of the
    weighted search, the scores of the various query_words as given by the
    user, and words whose document frequency is zero.
    """
    weights = {}
    query = request.form.get("query")
    for key in request.form:
        if key == "query":
            query = request.form[key]
            query = unicodedata.normalize('NFKD',
                                          query).encode('ascii', 'ignore')

        else:
            weights[key] = request.form[key]
            weights[key] = unicodedata.normalize('NFKD', weights[key]).encode(
                'ascii', 'ignore')
            weights[key] = float(weights[key]) / 100

    searcher = Searcher(query, query_score=weights)
    results = searcher.cosine_score()
    scores = searcher.query_score
    zero_scores = searcher.top_corrections
    boolean_results = searcher.boolean_results
    if len(boolean_results) == 0:
        boolean_error = True
    else:
        boolean_error = False
    title_results = searcher.title_results
    return render_template("displayResults.html",
                           input_query=query,
                           results=results,
                           scores=scores,
                           zero_scores=zero_scores,
                           title_results=title_results,
                           error=boolean_error)
Example #25
def run(args):
    # TODO update args according to config.json here
    setup = import_module(args.setup)
    task = args.task
    data_source = setup.DataSource(args)
    spec = setup.LearningSpec()

    if task == 'train':
        trainer = Trainer(args, data_source, spec.training_classifier())
        trainer.run()
    elif task == 'search':
        searcher = Searcher(args, data_source)
        searcher.fit(spec.gridsearch_pipelines())
    elif task == 'benchmark':
        run_benchmarks(args, data_source)
    elif task == 'learning_curves':
        run_learning_curves(args, spec, data_source)
    elif task == 'plot_pca':
        X, y = data_source.train_data()
        plot_pca(X, y)
    elif task == 'misclassified':
        print_misclassified(args, spec.training_classifier(), data_source)
Example #26
    def topic_specific_search(self, query, scheme="topic"):
        if scheme == "trust":
            rank_vector = np.load("trustRank.npy")
        else:
            rank_vector = self.topic_specific_ranker(query)
        results = Searcher(query).cosine_score(ranker=True)
        result_ids = []
        with closing(shelve.open("ids.db")) as db:
            for doc, score in results:
                doc_id = db[doc[16:]]
                doc_rank = rank_vector[doc_id]
                result_ids.append((doc_id, doc_rank))
        sorted_scores = heapq.nlargest(20,
                                       result_ids,
                                       key=operator.itemgetter(1))
        return sorted_scores
Example #27
class PopularTopics:
    def __init__(self, index_dir, analyzer):
        self.searcher = Searcher(index_dir, analyzer)

    def dict_append(self, entity, f_dist):
        entity = ' '.join(entity)
        if entity not in f_dist:
            f_dist[entity] = 0
        f_dist[entity] += 1

    def get_popular_topics(self, q_year, top_k):
        titles = self.searcher.search_year(q_year)
        unigram_dist = {}
        bigram_dist = {}
        trigram_dist = {}
        ngram_dist = {}

        tagset = None
        tagger = PerceptronTagger()
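        # NP chunk grammar: optional adjectives followed by a run of nouns that contains at least one NN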
        grammar = "NP: {<JJ>*(<NN>|<NNS>)*<NN>(<NN>|<NNS>)*}"
        cp = nltk.RegexpParser(grammar)
        for title in titles:
            title = title.lower()
            text = word_tokenize(title)
            sentence = nltk.tag._pos_tag(text, tagset, tagger)
            result = cp.parse(sentence)
            for node in list(result):
                if isinstance(node, nltk.tree.Tree):
                    entity = zip(*list(node))[0]
                    if len(entity) == 1:
                        self.dict_append(entity, unigram_dist)
                    elif len(entity) == 2:
                        self.dict_append(entity, bigram_dist)
                    elif len(entity) == 3:
                        self.dict_append(entity, trigram_dist)
                    else:
                        self.dict_append(entity, ngram_dist)

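        # drop the top 1% most frequent unigrams (mostly generic words), then keep the next top_k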
        unigram_result = Counter(unigram_dist).most_common(int(len(unigram_dist) * 0.01) + top_k)[int(len(unigram_dist) * 0.01):]
        bigram_result = Counter(bigram_dist).most_common(top_k)
        trigram_result = Counter(trigram_dist).most_common(top_k)

        result = unigram_result + bigram_result + trigram_result
        result = sorted(result, key=lambda k: k[1], reverse=True)[:top_k]
        return result
Example #28
def main(args):
    searcher = Searcher(args["limit"])
    workers = []

    if "domains" in args:
        # load domains from file
        domains = load_domains(args["domains"])

        for domain in domains:
            if domain == "":
                continue

            # lookup in search engine
            result = searcher.google_search(domain)

            # start the worker
            worker = Worker(domain, result.urls, result.page_source)
            workers.append(worker)

        print "\nNow waiting for workers to finish"

    else:
        # lookup in search engine
        result = searcher.google_search(args["domain"])

        # start the worker
        worker = Worker(args["domain"], result.urls, result.page_source)
        workers.append(worker)

    searcher.close()

    # wait for all workers to finish
    for worker in workers:
        worker.wait()

    # write emails to a file
    if "output" in args:
        write_excel_file(args["output"], workers)

    print "\nFinished scraping!\n"

    # output all emails
    for worker in workers:
        for email in worker.emails:
            print "> " + email
Example #29
    def topic_specific_ranker(self, query):
        specific_documents = Searcher(query).get_topic_documents()
        m = self.basic_matrix
        specific_doc_ids = list()
        with closing(shelve.open("ids.db")) as db:
            for doc in specific_documents:
                specific_doc_ids.append(db[doc[16:]])
        specific_vector = np.zeros(m.shape[0])
        for doc_id in specific_doc_ids:
            specific_vector[doc_id] = (1 - self.taxation_factor) / \
                                      len(specific_doc_ids)

        rank_vector = np.full(m.shape[0], 1.0 / m.shape[0])  # float division (under Python 2, 1 / m.shape[0] would be 0)
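        # power iteration: update the rank vector until its squared change falls below 1e-50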
        while True:
            rank_vector1 = m * rank_vector + specific_vector
            diff = rank_vector1 - rank_vector
            diff = sum(diff * diff)
            if diff < 1e-50:
                break
            else:
                rank_vector = rank_vector1

        return rank_vector
Example #30
#coding:utf-8
from index import Indexer
from search import Searcher

if __name__ == '__main__':
    index = Indexer("docs.txt")
    searcher = Searcher(index)

    i = 0
    while 1:
        i += 1

        input = raw_input(str(i) + ".请输入问题:")
        doclist = searcher.search(input.decode('utf-8'))

        if len(doclist) > 0:
            for doc in doclist:
                print doc.id, doc.name, doc.text
        else:
            print "无相关结果"
        print "\n"
Example #31
def main():
    server_url = 'localhost:9200'
    num_queries = 1000

    with open('hyper_params_set.json', 'r') as fh:
        hyper_params = json.load(fh)
        nums_groups = hyper_params['nums_groups']
        nums_clusters = hyper_params['nums_clusters']
        thresholds = hyper_params['thresholds']
        fh.close()

    with open('evaluation_set.json') as f:
        evaluation_set = json.load(f)
        f.close()

    final_results = []

    training_embedding_vectors = np.load("train_embs_VGGFace.npy")
    query_vector_indices = random.sample(range(len(evaluation_set.keys())),
                                         num_queries)
    train_labels, image_names = get_image_data(
        'vn_celeb_face_recognition/train.csv')

    for threshold in thresholds:
        for num_groups in nums_groups:
            for num_clusters in nums_clusters:

                print("working on {} groups, {} clusters, {} threshold".format(
                    num_groups, num_clusters, threshold))
                search_times = []
                mean_average_accuracy = 0
                mean_recall = 0
                for query_vector_index in query_vector_indices:

                    query_vector = training_embedding_vectors[evaluation_set[
                        str(query_vector_index)][0]]
                    actual_query_label = train_labels[evaluation_set[str(
                        query_vector_index)][0]]
                    num_actual_results = len(
                        evaluation_set[str(actual_query_label)])
                    # print(actual_query_label)
                    # print("------------")

                    es = Elasticsearch(server_url)
                    index_name = 'face_off_' + str(
                        num_groups) + 'groups_' + str(
                            num_clusters) + 'clusters_vgg'
                    if not es.indices.exists(
                            index_name
                    ):  # if data is not indexed, create index and take data to ES
                        # then query
                        indexer = ESIndexer('encode_results_vgg', num_groups,
                                            num_clusters, server_url, 'vgg')
                        indexer.index()

                        start_time = datetime.now()
                        searcher = Searcher(threshold, num_groups,
                                            num_clusters, query_vector,
                                            server_url, index_name, 'cosine',
                                            'vgg')
                        results = searcher.search()
                        # print(len(results))
                        if len(results) == 0: continue
                        search_time = datetime.now() - start_time
                        search_time_in_ms = (search_time.days * 24 * 60 * 60 +
                                             search_time.seconds) * 1000 + \
                                             search_time.microseconds / 1000.0
                        search_times.append(search_time_in_ms)
                    else:  # if not, commit query
                        start_time = datetime.now()
                        searcher = Searcher(threshold, num_groups,
                                            num_clusters, query_vector,
                                            server_url, index_name, 'cosine',
                                            'vgg')
                        results = searcher.search()
                        # print(len(results))
                        if len(results) == 0: continue
                        search_time = datetime.now() - start_time
                        search_time_in_ms = (search_time.days * 24 * 60 * 60 +
                                             search_time.seconds) * 1000 + \
                                            search_time.microseconds / 1000.0
                        search_times.append(search_time_in_ms)

                    # print(len(results))
                    results_labels = list()
                    for result in results:
                        # print(result['id'])
                        results_labels.append(result['id'])

                    # with open('evaluation_set.json', 'r') as fh:
                    #     evaluation_set_dict = json.load(fh)
                    #     fh.close()

                    accuracy_i = 0
                    for i in range(len(results)):
                        step_list = results_labels[:(i + 1)]
                        num_corrects = len([
                            i for i, x in enumerate(step_list)
                            if x == actual_query_label
                        ])
                        accuracy_i += num_corrects / len(step_list)
                    # print(accuracy_i/num_returns)
                    mean_average_accuracy += accuracy_i / len(results)

                    recall_i = num_corrects / num_actual_results
                    # print(num_corrects)
                    mean_recall += recall_i

                    # print("*************************************")
                average_search_time = round(
                    np.mean(np.asarray(search_times)) / 1000, 3)
                mean_average_accuracy = mean_average_accuracy / num_queries
                mean_recall = mean_recall / num_queries
                # print(average_search_time)
                # print(accuracy)

                final_results.append([
                    num_groups, num_clusters, threshold, num_queries,
                    'euclidean', average_search_time,
                    round(mean_average_accuracy, 4),
                    round(mean_recall, 4)
                ])
                print([
                    num_groups, num_clusters, threshold, num_queries,
                    'euclidean', average_search_time,
                    round(mean_average_accuracy, 4),
                    round(mean_recall, 4)
                ])

                print("finish")
                print("-----------------------------------------------")
Example #32
    def __init__(self, index_dir, analyzer):
        """
        Initializes searcher.
        """
        self.searcher = Searcher(index_dir, analyzer)
Example #33
    def __init__(self, index_dir):
        self.searcher = Searcher(index_dir)
Example #34
def run_search(rt):
    searcher = Searcher(rt.config, rt.data_source)
    searcher.fit(rt.spec.gridsearch_pipelines())
Example #35
    def setUp(self):
        self.searcher = Searcher(tempfile.mkdtemp())
        self.searcher.add_documents(json.dumps((self.doc1, self.doc2)))
Example #36
class ImageSearcher():
    '''Image searcher API for clothes retrieval web demo'''
    def __init__(self):
        root_path = os.path.dirname(__file__)
        tree_path = os.path.abspath(os.path.join(root_path, 'db/tree5'))
        inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
        feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
        self.searcher = Searcher(tree_path, inds_path, feature_path)
        #self.dataset = os.path.abspath(os.path.join(root_path, '../CBIR/datasets'))
        label_path = os.path.abspath(os.path.join(root_path, 'db/label.pkl'))
        self.label = cPickle.loads(open(label_path).read())

    def search(self, image_path, do_detection=1, k=50): 
        #queryImage = cv2.imread(image_path)
        t1 = Timer()
        t1.tic()
        #queryFeatures = descriptor.get_descriptor(image_path, multi_box=False)
        queryFeatures, label = descriptor.get_descriptor(image_path,
                                                         multi_box=False,
                                                         get_label=True,
                                                         do_detection=do_detection)
        flag = []  # when empty, results are not filtered by the predicted clothing class
        t1.toc('Feature Extraction time: ')
        t2 = Timer()
        t2.tic()
        #p = Profile()
        #results = p.runcall(self.searcher.search, queryFeatures)
        #p.print_stats()
        results, dists = self.searcher.search(queryFeatures)
        print dists
        t2.toc('Knn search time: ')
        result = []
        # origine image
        #result.append(image_path)
        if len(flag) != 0:
            for j in xrange(0, k):
                imageName = results[j]
                if imageName not in result:    
                    # even if the predicted class is wrong, keep images with very high similarity
                    if dists[j] < 0.05:
                        result.append(imageName)
                        continue
                    #if dists[j] > 0.2:
                    #    break
                    #judge whether image belongs to the class
                    image_path = imageName.split('/')
                    image_dir = image_path[0]+'/'+image_path[1]+'/'+image_path[2]
                    #print image_dir
                    if image_dir in flag:
                        result.append(imageName)
                    #else:
                    #    result.append(imageName)
        print 'total result', len(result)
        if len(result) < 3:
            # if the class-filtered result has fewer than 3 images, search across the whole dataset
            #print 'total result', len(result)
            k = 30
            result = []
            for j in xrange(0, k):
                imageName = results[j]
                if imageName not in result:
                    #if dists[j] > 0.2:
                    #    break
                    result.append(imageName)
        
        return result 
Example #37
melbourneBox = [143.7,-38.5,145.9,-37.05]

# just obfuscate, meaning these are harder to find / steal
firstHidden = base64.b64decode("VWxQNkdMc2s1TTJVVUEwVHVGZHVNREd2Yg==").decode("utf-8")
secondHidden = base64.b64decode("NHNqQ1VHWE81VGZqemM5RTNuUlFXUlVLTG1iY0M2dkFPS2p3d0tUZDYwbHFvVDNyZVM=").decode("utf-8")

thirdHidden = base64.b64decode("MjgwOTIyNTk0LXB4dTBtMnNqR01xeUU3ZTZhdmFOUUk0bmlDdXE2d2RoY202UmFRV04=").decode("utf-8")
fourthHidden = base64.b64decode("aVV4cVk3UjNCVE5lTWN6NmZRakloczJuYTRqbjV6RUx5cmtYdGdTYTFUNGs3").decode("utf-8")

# create the interface to the database
db = DBInterface(dbstring)

# authenticate
auth = tweepy.OAuthHandler(firstHidden, secondHidden)
auth.set_access_token(thirdHidden, fourthHidden)
api = tweepy.API(auth)

# decide what mode to run the application in
mode = sys.argv[1]

if mode == 'search' or mode == 'both':
    # start searching for all tweets going back a week
    sfThread = Searcher(api, db, melbourneRadial)
    sfThread.start()

if mode == 'stream' or mode == 'both':
    # start streaming tweets
    listener = Streamer(db)
    stream = tweepy.Stream(auth=api.auth, listener=listener)
    stream.filter(locations=melbourneBox)
Example #38
def main_1(var):

    num_groups = int(var[0])

    num_clusters = int(var[1])

    if var[2] >= 50:
        dist_function_name = 'euclidean'
    else:
        dist_function_name = 'cosine'
    threshold = var[3]

    server_url = 'localhost:9200'
    num_queries = 200

    with open('evaluation_set.json') as f:
        evaluation_set = json.load(f)
        f.close()

    training_embedding_vectors = np.load("PCA_2048_to_512_new.npy")
    query_vector_indices = random.sample(range(len(evaluation_set.keys())),
                                         num_queries)
    train_labels, image_names = get_image_data(
        'vn_celeb_face_recognition/train.csv')

    # print("working on {} groups, {} clusters, {} threshold".format(num_groups, num_clusters, threshold))
    search_times = []
    mean_average_accuracy = 0
    mean_recall = 0

    for query_vector_index in query_vector_indices:

        query_vector = training_embedding_vectors[evaluation_set[str(
            query_vector_index)][0]]
        # print(query_vector)
        actual_query_label = train_labels[evaluation_set[str(
            query_vector_index)][0]]
        num_actual_results = len(evaluation_set[str(actual_query_label)])
        # print(actual_query_label)
        # print("------------")

        es = Elasticsearch(server_url)
        index_name = 'face_off_' + str(num_groups) + 'groups_' + str(
            num_clusters) + 'clusters_vgg'
        if not es.indices.exists(
                index_name
        ):  # if data is not indexed, create index and take data to ES
            # then query
            data_encoder = DataEncoder(num_groups, num_clusters, 1000,
                                       training_embedding_vectors,
                                       'encode_results_vgg')
            data_encoder.run_encode_data()
            json_string_tokens_generator = JsonStringTokenGenerator(
                'encode_results_vgg', 'PCA_2048_to_512_new.npy',
                'vn_celeb_face_recognition/train.csv', num_groups,
                num_clusters)
            encoded_string_tokens_list = json_string_tokens_generator.get_string_tokens_list(
            )
            train_embs = json_string_tokens_generator.get_image_fetures()
            train_labels, image_names = json_string_tokens_generator.get_image_metadata(
            )
            json_string_tokens_list = json_string_tokens_generator.generate_json_string_tokens_list(
                encoded_string_tokens_list, train_labels, image_names,
                train_embs)
            json_string_tokens_generator.save_json_string_tokens(
                json_string_tokens_list)

            print('saving completed....')
            print('******************************')
            indexer = ESIndexer('encode_results_vgg', num_groups, num_clusters,
                                server_url, 'vgg')
            indexer.index()

            start_time = datetime.now()
            searcher = Searcher(threshold, num_groups, num_clusters,
                                query_vector, server_url, index_name,
                                dist_function_name, 'vgg')
            results = searcher.search()
            # print(len(results))
            if len(results) == 0: continue
            search_time = datetime.now() - start_time
            search_time_in_ms = (search_time.days * 24 * 60 * 60 +
                                 search_time.seconds) * 1000 + \
                                search_time.microseconds / 1000.0
            search_times.append(search_time_in_ms)
        else:  # if not, commit query
            start_time = datetime.now()
            searcher = Searcher(threshold, num_groups, num_clusters,
                                query_vector, server_url, index_name,
                                dist_function_name, 'vgg')
            results = searcher.search()
            # print(len(results))
            if len(results) == 0: continue
            search_time = datetime.now() - start_time
            search_time_in_ms = (search_time.days * 24 * 60 * 60 +
                                 search_time.seconds) * 1000 + \
                                search_time.microseconds / 1000.0
            search_times.append(search_time_in_ms)

        results_labels = list()
        for result in results:
            results_labels.append(result['id'])

        # with open('evaluation_set.json', 'r') as fh:
        #     evaluation_set_dict = json.load(fh)
        #     fh.close()

        accuracy_i = 0
        for i in range(len(results)):
            step_list = results_labels[:(i + 1)]
            num_corrects = len([
                i for i, x in enumerate(step_list) if x == actual_query_label
            ])
            accuracy_i += num_corrects / len(step_list)
        # print(accuracy_i/num_returns)
        mean_average_accuracy += accuracy_i / len(results)

        recall_i = num_corrects / num_actual_results
        # print(num_corrects)
        mean_recall += recall_i

        # print("*************************************")

    mean_average_accuracy = mean_average_accuracy / num_queries
    mean_recall = mean_recall / num_queries
    print(mean_average_accuracy, mean_recall)
    # print("precision: {} and recall: {}".format(mean_average_accuracy, mean_recall))
    # print(average_search_time)
    # print(mean_average_accuracy)

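    # minimisation objective: 3 minus mean average accuracy, mean recall, and their
    # harmonic mean, so lower values correspond to better retrieval quality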
    return 3 - mean_average_accuracy - mean_recall - (
        2 * mean_average_accuracy * mean_recall /
        (mean_average_accuracy + mean_recall))
Example #39
import logging
import sys

from search import Searcher
from flask import Flask, request, render_template

logging.basicConfig(filename="runtime.log", level=logging.INFO)
root = logging.getLogger()
root.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                              datefmt="%Y-%m-%d %H:%M:%S")
handler.setFormatter(formatter)
root.addHandler(handler)

searcher = Searcher()
app = Flask(__name__)

@app.route('/')
def hello():
    return render_template("main.html")

@app.route('/search', methods=['POST'])
def query():
    method = request.form.get("method")
    query = request.form.get("query")
    matches = searcher[method].search(query)
    return render_template("main.html", matches=matches)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5000)
Example #40
def api_search():
    query = request.args['query']
    field = request.args['field']
    searcher = Searcher()
    result = searcher.search(query, field)
    return jsonify(result)
Example #41
ap = argparse.ArgumentParser()
ap.add_argument("-q", "--query", required=True, help="Path to input image")
arg = vars(ap.parse_args())

f = open('dictionary.txt', 'r')
dataset = cPickle.loads(f.read())

queryImage = cv2.imread(arg["query"])
cv2.imshow("QueryImage", queryImage)
print "Query :: %s" % (arg["query"][arg["query"].rfind('/') + 1:])

rgbHist = RGBHist([8, 8, 8])
queryHist = rgbHist.getHist(queryImage)

searcher = Searcher(dataset)
results = searcher.search(queryHist)

set1 = np.zeros((150 * 5, 400, 3), dtype='uint8')
set2 = np.zeros((150 * 5, 400, 3), dtype='uint8')

for i in xrange(0, 10):
    (fileName, dist) = results[i]
    print "Result %d :: %s, Score :: %f" % (i, fileName, dist)
    path = './dataset/' + fileName
    image = cv2.imread(path)
    if i < 5:
        set1[150 * i:150 * (i + 1), :, :] = image
    else:
        set2[150 * (i - 5):150 * (i - 4), :, :] = image
Example #42
def home():
    searcher = Searcher()
    movies, tvs = searcher.default_display()
    return render_template("index.html",
                           movie_videos=movies["videos"],
                           tv_videos=tvs["videos"])
Example #43
'''from preprocessing import Preprocessor

preprocessor = Preprocessor(word_tokenize=True, remove_stopword=False, extract_entity=True, num_query=2)

print(preprocessor.transform("Hom qua em den truong tai Ha Noi, me dat tay tung buoc den Sai Gon"))
print(preprocessor.entities)'''
from os import listdir
from search import Searcher
searcher = Searcher()
fields = {'id': False, 'title': True, 'content': True, 'out': False}

docs = []
for fname in listdir('./data/folders/1001 bí ẩn/'):
    item = {
        'id': len(docs) + 1,
        'title': fname[:-4],
        'content': open('./data/folders/1001 bí ẩn/' + fname).read(),
        'out': fname[:-4]
    }
    docs.append(item)

searcher.set_fields(fields)
searcher.fit(docs[:10])
for i in range(10):
    print(docs[i]['title'])
i = 10
while True:
    s = input('Already to test: ')
    if s == 'add':
        print(docs[i]['title'])
        searcher.add_document(docs[i])
Example #44
import lucene
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.miscellaneous import PerFieldAnalyzerWrapper
from java.util import HashMap

from lxml import etree
from search import Searcher
from index import Indexer, CustomAnalyzer

INDEX_DIR = 'index'
# DATA_DIR = 'data/dblp.xml'
DATA_DIR = 'data/dblp_small.xml'

if __name__ == "__main__":
    # user inputs
    topN = 10

    lucene.initVM()

    # index documents
    config = {'lowercase': True, 'stemming': True, 'stopwords': True}
    title_analyzer = CustomAnalyzer(config)
    per_field = HashMap()
    per_field.put("title", title_analyzer)
    analyzer = PerFieldAnalyzerWrapper(
                StandardAnalyzer(Version.LUCENE_CURRENT), per_field)
    Indexer(DATA_DIR, INDEX_DIR, context, analyzer)
    searcher = Searcher(INDEX_DIR, analyzer)
    # # q = raw_input("Query: ")
    # # searcher.search(q, N=topN)
    searcher.run(topN)
Example #45
import argparse
import cPickle
import cv2
from search import Searcher  # assumed import path; other examples in this collection use the `search` module

# Construct argument parser using argparse
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True,
                help="Path to indexed image dataset")
ap.add_argument("-i", "--index", required=True,
                help="Path to index file")

args = vars(ap.parse_args())

# Load index  and initialize our searcher
index = cPickle.loads(open(args["index"]).read())
searcher = Searcher(index)

# loop over images in the index -- we will use each one as
# a query image
for (query, queryfeatures) in index.items():
    # perform the search using the current query
    results = searcher.search(queryfeatures)

    # load the query image and display it
    path = args["dataset"] + "/%s" %(query)
    queryImage = cv2.imread(path)
    cv2.imshow("Query", queryImage)
    print "query: %s" %(query)

    # initialize the 2 montages to display our results --
    # we have a total of 25 images in the index, but let's only
Example #46
class ResultsGenerator:
    def __init__(self, index_dir):
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        idList = list()
        for i in range(len(request)):
            hitDoc = self.searcher.searcher.doc(request[i].doc)
            idList.append(hitDoc.get("id_section"))
        return idList

    def process(self, input_file, index_dir, output_dir):
        output_file_1 = open(output_dir + "/results_BM25_1.txt",
                             'a+',
                             encoding="utf-8")
        output_file_2 = open(output_dir + "/results_BM25_2.txt",
                             'a+',
                             encoding="utf-8")
        output_file_3 = open(output_dir + "/results_BM25_3.txt",
                             'a+',
                             encoding="utf-8")
        output_file_4 = open(output_dir + "/results_BM25_4.txt",
                             'a+',
                             encoding="utf-8")
        output_file_5 = open(output_dir + "/results_VSM_1.txt",
                             'a+',
                             encoding="utf-8")
        output_file_6 = open(output_dir + "/results_VSM_2.txt",
                             'a+',
                             encoding="utf-8")
        output_file_7 = open(output_dir + "/results_VSM_3.txt",
                             'a+',
                             encoding="utf-8")
        output_file_8 = open(output_dir + "/results_VSM_4.txt",
                             'a+',
                             encoding="utf-8")

        num_lines = 0
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
            for p in data['data']:
                for par in p['paragraphs']:
                    for q in par["qas"]:
                        num_lines += 1

        with tqdm(total=num_lines) as pbar:
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
                for p in data['data']:
                    title = p["title"]
                    for par in p['paragraphs']:
                        for q in par["qas"]:
                            pbar.update(1)
                            if q["is_impossible"] is False:
                                question_content_s_BM25 = self.searcher.simpleSearch(
                                    q["question"], BM25Similarity())
                                id_question_content_s_BM25 = self.get_id_section(
                                    question_content_s_BM25)

                                question_title_content_s_BM25 = self.searcher.pairSearch(
                                    [title, q["question"]], BM25Similarity())
                                id_question_title_content_s_BM25 = self.get_id_section(
                                    question_title_content_s_BM25)

                                question_content_m_BM25 = self.searcher.multiFieldsSearch(
                                    q["question"], BM25Similarity())
                                id_question_content_m_BM25 = self.get_id_section(
                                    question_content_m_BM25)

                                question_title_content_m_BM25 = self.searcher.multiFieldsPairSearch(
                                    [title, q["question"]], BM25Similarity())
                                id_question_title_content_m_BM25 = self.get_id_section(
                                    question_title_content_m_BM25)

                                question_content_s_TDF = self.searcher.simpleSearch(
                                    q["question"], ClassicSimilarity())
                                id_question_content_s_TDF = self.get_id_section(
                                    question_content_s_TDF)

                                question_title_content_s_TDF = self.searcher.pairSearch(
                                    [title, q["question"]],
                                    ClassicSimilarity())
                                id_question_title_content_s_TDF = self.get_id_section(
                                    question_title_content_s_TDF)

                                question_content_m_TDF = self.searcher.multiFieldsSearch(
                                    q["question"], ClassicSimilarity())
                                id_question_content_m_TDF = self.get_id_section(
                                    question_content_m_TDF)

                                question_title_content_m_TDF = self.searcher.multiFieldsPairSearch(
                                    [title, q["question"]],
                                    ClassicSimilarity())
                                id_question_title_content_m_TDF = self.get_id_section(
                                    question_title_content_m_TDF)

                                for i in range(len(question_content_s_BM25)):
                                    output_file_1.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_content_s_BM25[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_s_BM25[i].score) +
                                        " STANDARD\n")
                                for i in range(
                                        len(question_title_content_s_BM25)):
                                    output_file_2.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_title_content_s_BM25[i]
                                            ) + " " + str(i + 1) + " " +
                                        str(question_title_content_s_BM25[i].
                                            score) + " STANDARD\n")
                                for i in range(len(question_content_m_BM25)):
                                    output_file_3.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_content_m_BM25[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_m_BM25[i].score) +
                                        " STANDARD\n")
                                for i in range(
                                        len(question_title_content_m_BM25)):
                                    output_file_4.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_title_content_m_BM25[i]
                                            ) + " " + str(i + 1) + " " +
                                        str(question_title_content_m_BM25[i].
                                            score) + " STANDARD\n")
                                for i in range(len(question_content_s_TDF)):
                                    output_file_5.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_content_s_TDF[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_s_TDF[i].score) +
                                        " STANDARD\n")
                                for i in range(
                                        len(question_title_content_s_TDF)):
                                    output_file_6.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_title_content_s_TDF[i])
                                        + " " + str(i + 1) + " " +
                                        str(question_title_content_s_TDF[i].
                                            score) + " STANDARD\n")
                                for i in range(len(question_content_m_TDF)):
                                    output_file_7.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_content_m_TDF[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_m_TDF[i].score) +
                                        " STANDARD\n")
                                for i in range(
                                        len(question_title_content_m_TDF)):
                                    output_file_8.write(
                                        q["id"] + " Q0 " +
                                        str(id_question_title_content_m_TDF[i])
                                        + " " + str(i + 1) + " " +
                                        str(question_title_content_m_TDF[i].
                                            score) + " STANDARD\n")

        print("==> Results successfully created.\n")
Example #47
class ImageSearcher():
    '''Image searcher API for clothes retrieval web demo'''
    def __init__(self):
        root_path = os.path.dirname(__file__)
        inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
        feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
        self.searcher = Searcher(inds_path, feature_path)
        # resolve relative to this file, consistent with the paths above
        self.local_features = np.load(os.path.join(root_path, 'db/local_features.npy'))

    def search(self, image_path, do_detection=1, k=10): 
        #queryImage = cv2.imread(image_path)
        t1 = Timer()
        t1.tic()
        #queryFeatures = descriptor.get_descriptor(image_path, multi_box=False)
        queryFeatures = descriptor.get_descriptor(image_path)
        
        t1.toc('Feature Extraction time: ')
        t2 = Timer()
        t2.tic()
        #p = Profile()
        #results = p.runcall(self.searcher.search, queryFeatures)
        #p.print_stats()
        results, dists, ind = self.searcher.search(queryFeatures,k=5*k)
        #self.reranking(queryFeatures, results, dists, ind, 0.6)
        #self.queryExpansion2(results, dists, ind)
        #self.queryExpansion(queryFeatures, results, dists, ind, top=3)
        t2.toc('Knn search time: ')
        result = []
        # optionally prepend the original query image
        #result.append(image_path)
        dist = []
        for j,imageName in enumerate(results):
            if imageName not in result:
                result.append(imageName)
                dist.append(dists[j])
        #print result[:k]
        return result[:k],dist[:k]

    def reranking(self, queryFeatures, results, dists, ind, rerank_thresh=0.7):
        features = self.local_features
        feature = []
        flag = 0
        dist = 0
        res = []
        for i,index in enumerate(ind):
            if dists[i] < rerank_thresh:
                flag += 1
            else:
                if dist == 0:
                    dist = dists[i-1]
                feature.append(features[index])
                res.append(results[i])
        if len(feature) < 3:
            return
        feature = np.array(feature).copy()
        result,new_ind = self.searcher.research(res, queryFeatures, feature, 3)
        for j,r in enumerate(result):
            results.insert(flag+j, r)
            dists.insert(flag+j, dist)

    def queryExpansion2(self, results, dists, ind, threshold=0.3, k=10, top=3):
        features = self.searcher.features
        for i in xrange(top):
            query = features[ind[i]]
            if dists[i] > threshold:
                break
            new_result, new_dist, new_ind = self.searcher.search(query, k=k)
            for j, dist in enumerate(new_dist):
                if dist > threshold:
                    break
                # do not reuse k here: it would shadow the k parameter used above
                for offset, d in enumerate(dists[i:]):
                    if dist < d:
                        results.insert(i + offset, new_result[j])
                        dists.insert(i + offset, dist)
                        break

    def queryExpansion(self, queryFeatures, results, dists, ind, threshold=0.8, k=10, top=5):
        """
        Do Query Expansion with at most top
        """
        features = self.searcher.features
        feature = []
        #feature.append(queryFeatures)
        for i,dist in enumerate(dists):
            #if dist < threshold and i < top:
            if i < top:
                feature.append(features[ind[i]])
        if len(feature) == 0:
            return 0
        query = np.mean(np.array(feature), axis=0)
        new_results, new_dists, new_ind = self.searcher.search(query,k=k)
        for i,dist in enumerate(new_dists):
            if dist > dists[-1]:
                break
            for j,d in enumerate(dists):
                if dist < d:
                    results.insert(j, new_results[i])
                    dists.insert(j, dist)
                    break 
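A minimal usage sketch for the ImageSearcher class above; the query path is hypothetical and assumes the db/ index, feature and local-feature files referenced in __init__ are present.

if __name__ == '__main__':
    image_searcher = ImageSearcher()
    # returns the top-k image paths and their distances to the query
    names, distances = image_searcher.search('query-images/example.jpg', k=10)
    for name, d in zip(names, distances):
        print name, d  # Python 2 print, matching the snippet above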
Example #48
import urllib
import json

from flask import Flask, send_from_directory
# Indexer, Searcher and the crossdomain decorator come from the project's own
# modules; their imports are not shown in the original snippet.

app = Flask(__name__, static_url_path='')


@app.route("/")
def main():
    return send_from_directory('static', 'search.html')


@app.route("/q/<input>")
@crossdomain(origin='*')
def search(input):
    print urllib.unquote_plus(input).encode('utf-8')
    doclist = searcher.search(input)

    result = []
    for doc in doclist:
        result.append(str(doc.id) + ", " + str(doc.name) + "<br>" + doc.text)

    return json.dumps(result)


index = Indexer("docs.txt")
searcher = Searcher(index)

if __name__ == "__main__":
    app.run(host='127.0.0.1', port=8282, debug=True)
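Once the service above is running locally, the /q/<input> route can be exercised with a plain HTTP GET. The client sketch below assumes the requests package and a sample query; neither is part of the original example.

import urllib
import requests

query = urllib.quote_plus("example query")  # mirror the server's unquote_plus (Python 2)
resp = requests.get("http://127.0.0.1:8282/q/" + query)
for hit in resp.json():  # the endpoint returns a JSON list of result strings
    print hit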
Example #49
 def __init__(self):
     root_path = os.path.dirname(__file__)
     inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
     feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
     self.searcher = Searcher(inds_path, feature_path)
     self.local_features = np.load('db/local_features.npy')
class ResultsGenerator:
    def __init__(self, index_dir):
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        idList = list()
        for i in range(len(request)):
            hitDoc = self.searcher.searcher.doc(request[i].doc)
            idList.append(hitDoc.get("id_section"))
        return idList

    def process(self, input_file, index_dir, output_dir):
        output_file_1 = open(
            output_dir + "/results.txt", 'a+', encoding="utf-8")

        num_lines = 0
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
            for p in data['data']:
                for par in p['paragraphs']:
                    for q in par["qas"]:
                        num_lines += 1

        model = create_model()
        model.load_weights("5e-5 0.1.h5")

        with tqdm(total=num_lines) as pbar:
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
                for p in data['data']:
                    for par in p['paragraphs']:
                        for q in par["qas"]:
                            pbar.update(1)
                            if q["is_impossible"] is False:
                                result = self.searcher.simpleSearch(q["question"], BM25Similarity())
                                ids = []
                                if not result:
                                    output_file_1.write('"' + str(q['id']) + '": "",\n')
                                    continue

                                content = ""
                                tab = [q["question"]]

                                for i in range(len(result)):
                                    hitDoc = self.searcher.searcher.doc(result[i].doc)
                                    content = hitDoc.get("content_section")
                                    tab.append(str(content))
                                    ids.append(hitDoc.get("id_section"))

                                inputs = []
                                for i in range(1, len(tab)):
                                    inputs.append([tab[0],tab[i]])
                            
                                # tokenization
                                squad_examples = []

                                for question, context in inputs:
                                    squad_eg = Example(question, context)
                                    squad_eg.preprocess()
                                    squad_examples.append(squad_eg)

                                dataset_dict = {"input_ids": [],
                                                "token_type_ids": [],
                                                "attention_mask": [],}
                                for item in squad_examples:
                                    if not item.skip:
                                        for key in dataset_dict:
                                            dataset_dict[key].append(getattr(item, key))
                                for key in dataset_dict:
                                    dataset_dict[key] = np.array(dataset_dict[key])

                                x = [dataset_dict["input_ids"],
                                    dataset_dict["token_type_ids"],
                                    dataset_dict["attention_mask"]]

                                y_pred = model.predict(x)
                                
                                sorted_indexes = sorted(range(len(y_pred)), key=lambda k: y_pred[k], reverse=True)
                                
                                r = 1
                                for i in sorted_indexes:
                                    output_file_1.write(
                                        q["id"] + " Q0 " + str(ids[i]) + " " + str(r) + " " + str(y_pred[i][0]) + " STANDARD\n")
                                    r += 1
        print("==> Results successfully created.\n")
Example #51
    output_attentions=False,
    output_hidden_states=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
modelClassifier.to(device)
modelClassifier.load_state_dict(torch.load(
    '/Users/younesagabi/Desktop/YouTaQA/DeepLearning/Classifier/Models/BERT_ft_epoch10.model',
    map_location=torch.device(device)),
                                strict=False)
modelExtractor = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')

txt_file = open(r"txt_file.txt", "w+")
tsv_file = open("test.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")
searchObject = Searcher(
    "/Users/younesagabi/Desktop/YouTaQA/IR/index_wiki_v7.0")
Similarity = Similarity()
for row in read_tsv:
    inputQuery = row[0]
    result = searchObject.multiFieldsSearch(inputQuery, BM25Similarity())
    # print(result)
    # print("#" * 100)
    # print("#" * 100)
    content = ""
    list = ['']
    list.append(inputQuery)
    list.pop(0)
    j = 0
    for i in range(len(result)):
        hitDoc = searchObject.searcher.doc(result[i].doc)
        score = result[i].score
Example #52
from generator import Generator
from search import Searcher
from connectionist import Connectionist
from vertex import Vertex

n_vertices = 10  # number of elements/nodes
g = Generator(n_vertices)
searcher = Searcher()
connector = Connectionist()

n = 20  # number of runs
for i in range(n):
    g.generate()
    belief_network = g.get_belief_network()
    neural_network = g.get_neural_network()

    coherence, (true, false) = searcher.run(belief_network)
    print 'coherence search:', coherence
    print 'accepted propositions:', sorted(true, key=lambda v: v.n)
    print 'rejected propositions:', sorted(false, key=lambda v: v.n)
    print '-----------------------------------------------'

    activations, harmony = connector.run(neural_network)
    print 'harmony', harmony
    true = []
    false = []
    for idx, a in enumerate(activations):
        if a == 1:
            true.append(Vertex(idx))
        else:
            false.append(Vertex(idx))
Example #53
def main(_):
    # Import config
    import yaml

    # Configure logger
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    
    # Load NLP libraries
    logger.info('Loading NLP library')
    import spacy
    from nltk.corpus import stopwords
    nlp = spacy.load('en')
    STOP_WORDS = set(stopwords.words('english'))

    # Parse search phrase
    search_input = FLAGS.search_phrase
    search_phrase = nlp(' '.join([word for word in search_input.split(' ') if word not in STOP_WORDS]))
    logger.info('Search phrase: "%s"' % search_phrase.text)

    results = []

    # Required for model
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(configuration.ModelConfig(), FLAGS.model_file)
    g.finalize()
    vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

    # Find files to search
    search_dir = FLAGS.base_dir if FLAGS.base_dir is not None else os.path.dirname(os.path.abspath(__file__))
    files = Searcher.search_from_dir(search_dir)
    num_files = len(files)
    logger.info('%d file(s) found' % num_files)

    with tf.Session(graph=g) as sess:
        # Load the model from checkpoint and instantiate caption generator model.
        restore_fn(sess)
        generator = caption_generator.CaptionGenerator(model, vocab)

        # Caption the files
        count = 0
        for file_path in files:
            count += 1.0
            try:
                with tf.gfile.GFile(file_path, "r") as f:
                    image = f.read()
                captions = generator.beam_search(sess, image)
                logger.info("Captioning image %f: %s" % (count/num_files,file_path))
                best_caption = captions[0] # Just take the most probable caption
                sentence = nlp(" ".join([vocab.id_to_word(word) for word in best_caption.sentence[1:-1] if word not in STOP_WORDS]))
                results.append((file_path, sentence.text, search_phrase.similarity(sentence)))
            except Exception as e:
                logger.warning('Failed to caption image: %s' % file_path)
                  

        render_results(search_phrase.text, sorted(results, key= lambda x : x[2], reverse=True))

        webbrowser.open('output.html',new=2)
Example #54
from java.util import HashMap

# The snippet also relies on these imports, which the original excerpt omits
# (module paths assume a PyLucene 4.x layout):
import json
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.miscellaneous import PerFieldAnalyzerWrapper
from org.apache.lucene.util import Version

from search import Searcher
from index import CustomAnalyzer
from utils import check_config

CONFIG_DIR = 'config.json'
INDEX_DIR = 'index'
DATA_DIR = 'data/dblp.xml'

# run search on command line
# see ui_search.py to use the search via web UI
if __name__ == "__main__":
    with open(CONFIG_DIR) as f:
        config = json.load(f)
    config = check_config(config)

    lucene.initVM()  # start JVM for Lucene

    # index documents
    # use different analyzer for title field
    title_analyzer = CustomAnalyzer(config['titleAnalyzer'])
    per_field = HashMap()
    per_field.put("title", title_analyzer)
    analyzer = PerFieldAnalyzerWrapper(
                StandardAnalyzer(Version.LUCENE_CURRENT), per_field)
    searcher = Searcher(INDEX_DIR, analyzer)
    # q = raw_input("Query: ")
    # searcher.search(q, N=config['topN'])
    searcher.run(config['topN'])
Example #55
from cspProblemDefine import CSP, Constraint, ne_, is_ 
from operator import lt,ne,eq,gt
from search import Search_from_CSP, Searcher

def meet_at(p1,p2):
    """returns a function that is true when the words meet at the postions p1, p2
    """
    def meets(w1,w2):
        return w1[p1] == w2[p2]
    meets.__name__ = "meet_at("+str(p1)+','+str(p2)+')'
    return meets

crossword1 = CSP({'one_across':{'ant', 'bus', 'car', 'has'},
                  'one_down':{'buys', 'hold', 'lane', 'year'},
                  'three_across':{'buys', 'hold', 'lane', 'year'},
                  'two_down':{'search', 'syntax'},
                  'four_across':{'ant', 'bus', 'car', 'has'}
                  },
                  [Constraint(('one_across','one_down'),meet_at(0,0)),
                   Constraint(('one_down','three_across'),meet_at(2,0)),
                   Constraint(('one_across','two_down'),meet_at(2,0)),
                   Constraint(('three_across','two_down'),meet_at(2,2)),
                   Constraint(('four_across','two_down'),meet_at(0,4))
                   ])
    
searcher3 = Searcher(Search_from_CSP(crossword1))
print('The first solution searched is:')
print(searcher3.search())
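For reference, each meet_at constraint only compares a single letter position in each word. A quick illustration with words from the domains above (the calls themselves are illustrative, not part of the original script):

check = meet_at(2, 0)          # third letter of the across word == first letter of the down word
print(check('has', 'syntax'))  # True:  'has'[2] == 'syntax'[0] == 's'
print(check('ant', 'search'))  # False: 'ant'[2] is 't', 'search'[0] is 's'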

Example #56
 def __init__(self):
     root_path = os.path.dirname(__file__)
     tree_path = os.path.abspath(os.path.join(root_path, 'db/tree'))
     inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
     feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
     self.searcher = Searcher(tree_path, inds_path, feature_path)
Example #57
 def __init__(self, index_dir, analyzer):
     self.searcher = Searcher(index_dir, analyzer)