def _search_pharse_func_tester(pharse, doc_id):
    """Preprocess a phrase, search for it in one document and print the result.

    Every whitespace-separated token of ``pharse`` is run through a freshly
    created ``Token_Preprocessing_Engine``; the resulting list of terms is
    handed to ``search_pharse`` together with ``doc_id`` and whatever that
    call returns is forwarded to ``send_stdout``.
    """
    engine = Token_Preprocessing_Engine()
    processed = [engine.process_token(word) for word in pharse.split()]
    send_stdout(search_pharse(processed, doc_id))
def main():
    """Build the zone index for a document directory and write it to disk.

    Steps:
      1. Read the command-line arguments with ``parse_arguments()``.
      2. Collect the regular files found directly in ``args.doc_dir``.
      3. Refuse to continue when ``ZONE_INDEX_FILE`` already exists inside
         ``args.index_dir``.
      4. Create the global token preprocessor ``st`` when ``STEMMER`` is set.
      5. Let ``read_dir`` process the documents.
      6. Write one ``"<term> <posting>"`` line per entry of ``zone_index``,
         ordered by term, to the index file.

    On a missing document directory or an already existing index file an
    error message is sent to ``send_stdout`` and the function returns early.
    """
    global st
    # read arguments
    args = parse_arguments()

    # get filenames from the [document dir]
    try:
        doc_files = [
            f for f in listdir(args.doc_dir) if isfile(join(args.doc_dir, f))
        ]
    except FileNotFoundError:
        send_stdout('Error! No such file or directory "{}".'.format(
            args.doc_dir))
        return

    # check whether the index file for zone scoring already exist
    index_path = join(args.index_dir, ZONE_INDEX_FILE)
    if isfile(index_path):
        send_stdout('Error! Index file "{}" already exist.'.format(index_path))
        return

    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    # read directory -> read doc -> create zone indexes
    read_dir(args.doc_dir, doc_files)

    # write index to file; the context manager guarantees the handle is
    # closed even if formatting or writing a posting raises
    with open(index_path, 'w') as f_out:
        for term in sorted(zone_index):
            f_out.write('{term} {posting}\n'.format(term=term,
                                                    posting=zone_index[term]))
# Example no. 3
# 0
def main():
    """Create language models for every document in a directory.

    Expects exactly two command-line arguments,
    ``[document_dir] [output_dir]``; otherwise a usage line is printed and
    the function returns.

    Each regular file in the document directory is checked with
    ``filename_validation`` and, when valid, passed to ``read_file`` together
    with the document id that validation returned. Files with an invalid
    name and files for which ``read_file`` raises are collected and reported
    as warnings instead of aborting the run. Finally ``str(LM_LMS)`` is
    written to ``LM_NAME`` inside the output directory.

    The function returns early (after printing an error) when the document
    directory does not exist or the output file is already present.
    """
    global st

    # read arguments "% ./create_lms [document dir] [output_dir]"
    if len(sys.argv) != 3:
        send_stdout("Usage: python3 {} [document_dir] [output_dir]".format(
            sys.argv[0]))
        return

    # get filenames from the [document dir]
    doc_dir = sys.argv[1]
    try:
        docs = [f for f in listdir(doc_dir) if isfile(join(doc_dir, f))]
    except FileNotFoundError:
        send_stdout('Error! No such file or directory "{}".'.format(doc_dir))
        return

    # check whether the index file already exist in the [output_dir]
    lm_file = join(sys.argv[2], LM_NAME)
    if isfile(lm_file):
        send_stdout('Error! LM file "{}" already exist.'.format(lm_file))
        return

    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    skipped_docs = []
    invalid_filename_docs = []
    f_num = len(docs)
    for position, fname in enumerate(docs, start=1):
        success, doc_id = filename_validation(fname)
        if not success:
            invalid_filename_docs.append(fname)
            continue
        try:
            # read file, and create language models (calculate MLE)
            read_file(join(doc_dir, fname), doc_id)
        except Exception:
            # deliberate best effort: one unreadable document must not stop
            # the whole run; it is reported in the warning below
            skipped_docs.append(fname)
            continue
        # update progress bar (only after a successfully processed file)
        progress(position, f_num)

    send_stdout()
    # show invalid document name/format to stdout
    if invalid_filename_docs:
        send_stdout('Warning! Invalid document name format:')
        send_stdout('{}, Skipped.'.format(invalid_filename_docs))
    if skipped_docs:
        send_stdout('Warning! Cannot process the following doc(s):')
        send_stdout('{}, Skipped.'.format(skipped_docs))

    # write language models to file; ``with`` closes the handle even when
    # the write fails
    with open(lm_file, 'w') as f_out:
        f_out.write(str(LM_LMS))
def main():
    """Run a boolean query against the zone index and print the top-k scores.

    The query ``args.q`` is validated, the zone index file
    (``ZONE_INDEX_FILE`` inside ``args.index_dir``) is loaded through
    ``read_index``, and the query is preprocessed and parsed with
    ``searchExpr``. Every id in the global ``documents`` then receives a
    score: ``args.g`` is added when the query evaluates to true for the
    ``TITLE`` zone and ``1 - args.g`` when it does for the ``BODY`` zone.
    The ``args.k`` best ``"<id> \\t <score>"`` lines are printed in
    descending score order.

    Calls ``sys.exit()`` after printing an error when the query is invalid,
    the index file is missing, or ``read_index`` raises.
    """
    global st, documents

    # read arguments
    args = parse_arguments()

    # query validation
    if not validate_query(args.q):
        send_stdout('Error! Invalided boolean query.')
        sys.exit()

    # open index file
    path = join(args.index_dir, ZONE_INDEX_FILE)
    try:
        f = open(path)
    except FileNotFoundError:
        send_stdout('Error! Zone index file "{}" does not exits.'.format(path))
        sys.exit()

    # The handle stays open for the whole search, exactly as before, but the
    # context manager now also closes it on sys.exit() and on any exception.
    with f:
        # read index
        send_stdout("Reading zone index ...")
        try:
            read_index(f)
        except Exception as e:
            print(e)
            send_stdout('Error! Invalided zone index file format.')
            sys.exit()

        # initialize query stemmer (Lemmatizer)
        if STEMMER:
            st = Token_Preprocessing_Engine()

        # query preprocessing
        p_query = preprocessing_query(args.q)
        # parse query
        lisp_bool_query = str(searchExpr.parseString(p_query)[0])
        send_stdout("Pharsed Boolean Query: {}.".format(lisp_bool_query))

        # find document that satisfied the boolean query
        send_stdout("Searching and scoring ...")
        result = {}
        for doc_id in documents:
            score = 0
            if query_valuation(lisp_bool_query, doc_id, TITLE):
                score += args.g
            if query_valuation(lisp_bool_query, doc_id, BODY):
                score += 1 - args.g
            result[doc_id] = score

        # sorted() is stable, so documents with equal scores keep the
        # iteration order of ``documents``
        ranked = sorted(result.items(), key=lambda x: x[1], reverse=True)
        # clamp at 0 so a negative k prints nothing instead of slicing from
        # the end of the list
        for d, s in ranked[:max(args.k, 0)]:
            send_stdout('{id} \t {score}'.format(id=d, score=s))
# Example no. 5
# 0
def main():
    """Build the positional index for a directory and write it to disk.

    Expects exactly one command-line argument, the document directory;
    otherwise a format hint is printed and the function returns.

    Each regular file in the directory is split on ``'_'`` (at most twice).
    Files whose first part is not ``'doc'`` are skipped; for the others
    ``read_file`` is called with the file path and the second part converted
    to ``int``. Any exception raised while handling a file (including a
    missing or non-numeric second part) puts that file on the skipped list
    instead of aborting. Afterwards every entry of ``positional_index`` is
    written, sorted by term, as a ``"<term> <index>"`` line to
    ``INDEX_FILE``.

    The function returns early (after printing an error) when the directory
    does not exist or ``INDEX_FILE`` is already present.
    """
    global st
    # read arguments
    if len(sys.argv) != 2:
        send_stdout("format: python {} [dir]".format(sys.argv[0]))
        return

    # get filenames from the [dir]
    path = sys.argv[1]
    try:
        files = [f for f in listdir(path) if isfile(join(path, f))]
    except FileNotFoundError:
        send_stdout('Error! No such file or directory "{}".'.format(path))
        return

    # check whether the index file already exist
    if isfile(INDEX_FILE):
        send_stdout('Error! Index file "{}" already exist.'.format(INDEX_FILE))
        return

    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    skipped_files = []
    f_num = len(files)
    for position, fname in enumerate(files, start=1):
        finfo = fname.split(sep='_', maxsplit=2)
        # filename validation
        if finfo[0] != 'doc':
            skipped_files.append(fname)
            continue
        try:
            # read file, and create indexes
            read_file(join(path, fname), int(finfo[1]))
        except Exception:
            # deliberate best effort: a bad file name or unreadable file is
            # reported below rather than stopping the indexing run
            skipped_files.append(fname)
            continue
        # update progress bar (only after a successfully indexed file)
        progress(position, f_num)

    send_stdout()
    if skipped_files:
        send_stdout('Warning! Cannot index the following file(s):')
        send_stdout('{}, Skipped.'.format(skipped_files))

    # write index to file; ``with`` closes the handle even when a write fails
    with open(INDEX_FILE, 'w') as f_out:
        for term in sorted(positional_index):
            f_out.write('{term} {index}\n'.format(
                term=term, index=positional_index[term]))
def process_query(query):
    """Turn a query string into a list of normalised terms.

    The query is split on whitespace. When ``STEMMER`` is enabled each token
    is passed through a new ``Token_Preprocessing_Engine``; otherwise each
    token is simply lower-cased. The terms are returned in query order.
    """
    if not STEMMER:
        # no preprocessing engine configured: lowercasing only
        return [word.lower() for word in query.split()]

    # stemming/lemmatisation path; the engine is created before splitting
    engine = Token_Preprocessing_Engine()
    return [engine.process_token(word) for word in query.split()]
def main():
    """Evaluate a boolean query against the index and print matching ids.

    The query ``args.query`` is validated, the index file (``INDEX_FILE``
    inside ``args.path``) is loaded through ``read_index``, and the query is
    preprocessed and parsed with ``searchExpr``. Every id in the global
    ``documents`` for which ``query_valuation`` is true is collected and the
    resulting list is printed.

    Calls ``sys.exit()`` after printing an error when the query is invalid,
    the index file is missing, or ``read_index`` raises.
    """
    global st, documents

    # read arguments
    args = parse_arguments()

    # query validation
    if not validate_query(args.query):
        send_stdout('Error! Invalided boolean query.')
        sys.exit()

    # open index file
    path = join(args.path, INDEX_FILE)
    try:
        f = open(path)
    except FileNotFoundError:
        send_stdout('Error! Index file "{}" does not exits.'.format(path))
        sys.exit()

    # The handle stays open for the whole search, exactly as before, but the
    # context manager now also closes it on sys.exit() and on any exception.
    with f:
        # read index
        try:
            read_index(f)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit are not misreported as a broken index file
            send_stdout('Error! Invalided index file format.')
            sys.exit()

        # initialize query stemmer (Lemmatizer)
        if STEMMER:
            st = Token_Preprocessing_Engine()

        # query preprocessing
        p_query = preprocessing_query(args.query)
        # parse query
        lisp_bool_query = str(searchExpr.parseString(p_query)[0])
        send_stdout("Pharsed Boolean Query: {}.".format(lisp_bool_query))

        # find document that satisfied the boolean query
        result = [
            doc_id for doc_id in documents
            if query_valuation(lisp_bool_query, doc_id)
        ]
        send_stdout("Documents: {}.".format(result))
def main():
    """Rank documents for a free-text query and print the top-k results.

    ``args.score`` must be ``'y'`` or ``'n'``. The index file
    (``INDEX_FILE`` inside ``args.path``) is opened, the query terms
    ``args.terms`` are normalised (via ``Token_Preprocessing_Engine`` when
    ``STEMMER`` is set, otherwise lower-cased), the index is loaded through
    ``read_index`` and ``cosine_score`` is called with the normalised terms.
    The ``args.k`` highest-scoring entries are printed in descending score
    order, with the score when ``args.score == 'y'`` and as a bare id
    otherwise.

    Calls ``sys.exit()`` after printing an error when ``args.score`` is
    invalid, the index file is missing, or ``read_index`` raises.
    """
    # read arguments
    args = parse_arguments()
    if args.score not in ('y', 'n'):
        send_stdout('Error! arg "scores" should be either y or n')
        sys.exit()

    # open index file
    path = join(args.path, INDEX_FILE)
    try:
        f = open(path)
    except FileNotFoundError:
        send_stdout('Error! Index file "{}" does not exits.'.format(path))
        sys.exit()

    # The handle stays open for the whole run, exactly as before, but the
    # context manager now also closes it on sys.exit() and on any exception.
    with f:
        # initialize query stemmer (Lemmatizer)
        if STEMMER:
            st = Token_Preprocessing_Engine()
            query = [st.process_token(t) for t in args.terms]
        else:
            query = [t.lower() for t in args.terms]

        # read index
        try:
            read_index(f)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit are not misreported as a broken index file
            send_stdout('Error! Invalided index file format.')
            sys.exit()

        # compute vector space scores
        score = cosine_score(query)
        # sorted() is stable, so ties keep the order produced by cosine_score
        ranked = sorted(score.items(), key=lambda x: x[1], reverse=True)
        show_score = args.score == 'y'
        # clamp at 0 so a negative k prints nothing instead of slicing from
        # the end of the list
        for d, s in ranked[:max(args.k, 0)]:
            if show_score:
                send_stdout('{id} \t {score}'.format(id=d, score=s))
            else:
                send_stdout('{id}'.format(id=d))