Пример #1
0

if __name__ == "__main__":

    args = utils.parse_args()

    if args.fib:
        archiver = fib_archive.FibonacciArchiver(args.fib)
    elif args.s9:
        archiver = s9_archive.Simple9Archiver()

    morph = pymorphy2.MorphAnalyzer()
    bs = BooleanSearch(args.ndx_name, args.bin_name, archiver)

    with open(args.url_name, "r") as f_urls:
        urls = map(lambda line: utils.norm_url(line.split()[1]), f_urls)

    if not args.mrk_name:

        while True:

            print "query=",
            if sys.platform.startswith("win"):
                query = unicode(sys.stdin.readline(), "cp866")
            else:
                reload(sys)
                sys.setdefaultencoding("utf-8")
                query = unicode(sys.stdin.readline())

            if u"exit" in query:
                break
Пример #2
0
def main():
    args = utils.parse_args()

    if  args.fib:
        archiver = fib_archive.FibonacciArchiver(args.fib)
    elif args.s9:
        archiver =  s9_archive.Simple9Archiver()

    if  not os.path.exists(args.url_name + 'urls.txt'):
        with open(args.url_name, 'r') as f_urls:
            urls = map(lambda line: utils.norm_url(line.strip().split()[1]), f_urls)
    
        with open(args.url_name + 'urls.txt', 'w') as f:
            print >>f, '\n'.join(urls)
    else:
        with open(args.url_name + 'urls.txt', 'r') as f:
            urls = f.read().split('\n')
        
    bs = BooleanSearch(args.ndx_name, args.bin_name, archiver)     
    br = BlackSearch(bs, lex=utils.MyLex(), dlen_name=args.len_name)
    ts = TextSearch(br)
    
    IS_IN = 0
    iter = 0

    f_ranks = codecs.open(args.rnk_name, 'w', encoding='utf-8')
    
    with codecs.open('mark_ids.txt', 'r', encoding='utf-8') as f_marks: # args.mrk_name
        with codecs.open('params.txt', 'w', encoding='utf-8') as f_params:

            # L_marks = f_marks.readlines()
            # ts.params = [ p / len(L_marks) for p in ts.params ]
            # print ts.params

            total_time = time.time()
            for it,line in enumerate(f_marks): # L_marks):

                splt = line.split('\t')
                if len(splt) != 2: continue

                # query, mark_url = splt[0], utils.norm_url(splt[1])
                query, mark_id = splt[0], int(splt[1])

                print "%3d '%s'" % (it, query.encode('cp866', 'ignore')),
                print >>f_ranks, "%3d '%s'" % (it, query)

                inv_q = ts.br.lex.incorrect_keyboard_layout(query)
                if inv_q:
                    print "\n    '%s'" % inv_q,
                    print >>f_ranks, "'%s'" % inv_q

                start_time = time.time()
                answer = ts.search(query, 1000, mark_id=mark_id, f_params=f_params)
                print "\t%.3f sec." % (time.time() - start_time),
                print >>f_ranks, "%.3f sec.\n" % (time.time() - start_time)

                # answer_urls = [ urls[i] for i in answer ]
                # found_urls = '\n'.join(answer_urls)
                # print >>f_ranks, mark_id, '=', ','.join([str(a) for a in answer])

                try:
                    z = answer.index(mark_id)
                    print '\tOK(%d)' % z
                    print >>f_ranks, '\tOK(%d)' % z
                    IS_IN += 1
                except:
                    print '\t---'
                    print >>f_ranks, '---\n\n'

            print "\n%.3f sec." % (time.time() - total_time)
            print >>f_ranks, "\n%.3f sec.\n" % (time.time() - total_time)

            print "IS_IN %d" % IS_IN
            print >>f_ranks, "IS_IN %d" % IS_IN