Exemplo n.º 1
0
 def test_escape(self):
     """Check that ``pyndri.escape`` maps each punctuated input to 'hello world'.

     Covers parentheses, a dot and a colon used in or around the words.
     """
     for raw_text in ('hello (world)', 'hello.world', 'hello:world'):
         self.assertEqual(pyndri.escape(raw_text), 'hello world')
Exemplo n.º 2
0
    def test_tokenize(self):
        """Check ``pyndri.tokenize`` on raw and on escaped query strings.

        Raw strings go straight to the tokenizer; escaped cases are first
        passed through ``pyndri.escape``. A parenthesised raw string is
        expected to raise ``OSError``.
        """
        def check(text, expected, escaped=False):
            # Tokenize ``text`` (optionally escaped first) and compare.
            query = pyndri.escape(text) if escaped else text
            self.assertEqual(pyndri.tokenize(query), expected)

        check('hello world foo bar', ('hello', 'world', 'foo', 'bar'))
        check('hello-world', ('hello', 'world'))

        # Without escaping, only the part before the dot is expected back;
        # with escaping, both words are.
        check('hello.world', ('hello', ))
        check('hello.world', ('hello', 'world'), escaped=True)

        check('hello "world"', ('hello', 'world'))

        # Unescaped parentheses are expected to be rejected.
        with self.assertRaises(OSError):
            pyndri.tokenize('hello (world)')

        check('hello \'world\'', ('hello', 'world'), escaped=True)
        check('hello/world', ('hello', 'world'), escaped=True)
Exemplo n.º 3
0
def search():
    """Render the search page for the query found in the request arguments.

    Request arguments read:

    * ``q`` -- the query string. A value starting with ``docid:`` looks up
      a single document by its external identifier; any other value is run
      as a retrieval query against the index.
    * ``smoothing_method`` (default ``'dirichlet'``) and ``smoothing_param``
      (default ``1000``) -- handed to ``build_smoothing_rule``.
    * ``results_requested`` (default ``10``) -- handed to the query call.

    Numeric arguments that cannot be parsed fall back to their defaults
    rather than letting ``ValueError`` escape the view.

    Returns:
        The result of ``render_template('index.html', ...)``, given the
        query, the list of ``(external_doc_id, rendered_text)`` pairs and
        the smoothing settings.
    """
    def _numeric_arg(name, default, cast):
        # A malformed value (e.g. ``?results_requested=abc``) must not turn
        # into an unhandled exception; use the default instead.
        try:
            return cast(request.args.get(name, default))
        except ValueError:
            return cast(default)

    index, dictionary = get_index()

    query_string = request.args.get('q', None)

    smoothing_method = request.args.get('smoothing_method', 'dirichlet')
    smoothing_param = _numeric_arg('smoothing_param', 1000, float)
    results_requested = _numeric_arg('results_requested', 10, int)

    documents = []

    if query_string is not None:
        logging.info('Query string: %s', query_string)

        docid_prefix = 'docid:'
        is_docid_lookup = query_string.startswith(docid_prefix)

        # Ids of query terms known to the dictionary; these get highlighted
        # in the rendered documents. Not collected for docid lookups.
        highlighted_token_ids = set()

        if not is_docid_lookup:
            for token in index.tokenize(pyndri.escape(query_string)):
                if dictionary.has_token(token):
                    highlighted_token_ids.add(
                        dictionary.translate_token(token))

        def _include_document(int_doc_id):
            # Fetch one document and append its rendered form to the results.
            ext_doc_id, doc_token_ids = index.document(int_doc_id)

            def _format_token(token_id):
                term = dictionary[token_id]

                if token_id in highlighted_token_ids:
                    term = '<strong>{}</strong>'.format(term)

                return term

            # Non-positive ids are shown as an HTML-escaped "<unk>"
            # placeholder instead of being looked up in the dictionary.
            doc_tokens = [
                _format_token(token_id) if token_id > 0 else '&lt;unk&gt;'
                for token_id in doc_token_ids
            ]

            documents.append((ext_doc_id, ' '.join(doc_tokens)))

        if is_docid_lookup:
            ext_document_id = query_string[len(docid_prefix):]
            lookup = dict(index.document_ids([ext_document_id]))

            # An empty lookup means no document was found for that id;
            # the page is then rendered without results.
            if lookup:
                _include_document(lookup[ext_document_id])
        else:
            query_env = pyndri.QueryEnvironment(index,
                                                rules=(build_smoothing_rule(
                                                    smoothing_method,
                                                    smoothing_param), ))

            results = query_env.query(query_string,
                                      results_requested=results_requested)

            for int_doc_id, _ in results:
                _include_document(int_doc_id)

    return render_template('index.html',
                           query=query_string,
                           results=documents,
                           smoothing_method=smoothing_method,
                           smoothing_param=smoothing_param)
Exemplo n.º 4
0
	# Write one <query> element per topic into
	# RetrievalParameterFile_<collection_name>.xml in the output folder.
	# The file is managed by ``with`` so it is closed (and flushed) even if
	# processing a topic raises.
	with open(join(args["<outputfolder>"],"RetrievalParameterFile_{name}.xml".format(name=args["<collection_name>"])), 'w') as outputFile:
		outputFile.write("<parameters>\n")

		tokenizer=MosesTokenizer()

		# Raw string: same pattern as before, without invalid-escape warnings.
		# NOTE(review): every part of this pattern is optional, so match()
		# succeeds on any term and the filter below lets everything through.
		# Confirm whether only acronym-like terms were meant to lose their dots.
		prog = re.compile(r"[_\-\(]*([A-Z]\.)*[_\-\(]*")
		tops = {}
		for top in topics:
			terms=topics[top].split()
			# Each kept term is prefixed with a space and stripped of dots.
			toptext="".join(" "+t.replace('.','') for t in terms if prog.match(t))
			toptext=escape(toptext)
			tops[top]=tokenizer.tokenize(toptext,return_str=True)

		# Emit the topics in sorted key order.
		topics = collections.OrderedDict(sorted(tops.items()))

		for t in topics :
			print("topic : {t}".format(t=t))
			outputFile.write(" <query>\n  <type>indri</type>\n")
			# Topic keys must be parseable as integers here.
			outputFile.write("  <number>{num}</number>\n".format(num=int(t)))
			outputFile.write("  <text>\n")
			outputFile.write("   {txt}\n".format(txt=topics[t]))
			outputFile.write("  </text>\n")
			outputFile.write(" </query>\n")

		outputFile.write("</parameters>")
	print("\nEnded.")