reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="  ").replace(
        "<?xml version=\"1.0\" ?>\n", "")


logging.basicConfig(filename="logs/parse_xml_parameters4indri.log",
                    level=logging.DEBUG)
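# logging.basicConfig does not create the logs/ directory; it has to exist before the
# script runs, otherwise the underlying FileHandler raises FileNotFoundError.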

if __name__ == '__main__':
    config_file = sys.argv[1]
    config = json.load(open(config_file))
    logging.info('Config: ' + json.dumps(config, indent=2))
    out = config["output"]

    # process queries:
    queries = extract_trec_million_queries(config["queries"])
    queries_text = {}
    stoplist = set(stopwords.words("english")) if config["stopwords"] else set()
    qrels_MQ = []  # get only judged queries
    for file in os.listdir(config["qrels_MQ"]):
        qrels_MQ += list(read_values(os.path.join(config["qrels_MQ"], file),
                                     0))
    qrels_MQ = set(qrels_MQ)
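    # qrels_MQ now holds the ids of every judged Million Query topic; read_values is
    # presumably pulling column 0 (the topic id) from each qrels file.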
    q_times = defaultdict(int)
    print("Pre-process queries %d queries..." % len(qrels_MQ))
    logging.info("Pre-process queries %d queries..." % len(qrels_MQ))
    for q in tqdm(qrels_MQ):
        q_text = clean(queries[q], config["stemmer"], stoplist)
        q_times[q_text] += 1  # queries with duplicate content
        queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
            [q_text, str(q_times[q_text])])
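    # Queries whose cleaned text collides with an earlier one get a numeric suffix
    # ("2", "3", ...) appended, so each query id keeps a distinct text downstream.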
Example #2
from os.path import join


if __name__ == '__main__':
    print("Split data into folds for training,tast and validation")
    config_file = sys.argv[1]
    config_all = json.load(open(config_file))
    config = config_all["parameters"]
    print('Config: '+json.dumps(config, indent=2))

    if config["only_queries"] or config["only_docs"]:
        to_split = []

        if config["only_queries"]:
            queries = extractTopics(config["queries_folder"]) if config["queries_format"] == "trec" \
                else extract_trec_million_queries(config["queries_folder"])
            to_split = list(queries.keys())
        elif config["only_docs"]:
            index = pyndri.Index(config["index"])
            docs_list = list(range(index.document_base(), index.maximum_document()))
            external_doc_id = [index.document(doc)[0] for doc in docs_list]
            to_split = external_doc_id

        folds = {}
        random.shuffle(to_split)
        split = chunkIt(to_split, config["folds_num"])
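        # chunkIt is defined elsewhere; a minimal sketch of such a helper, assuming it
        # splits a sequence into `num` roughly equal consecutive parts:
        #
        #     def chunkIt(seq, num):
        #         avg = len(seq) / float(num)
        #         return [seq[int(round(avg * i)):int(round(avg * (i + 1)))]
        #                 for i in range(num)]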

        if config["validation"]:
            for i in tqdm(range(config["folds_num"])):
                    # print("fold ",i, end="\t")
                    test = split[i]
    print("Data extraction\nConfiguration: ")
    print(json.dumps(config, indent=2), end='\n')

    print("Reading index ...")
    index = pyndri.Index(config["indexed_data"])
    _, id2token, _ = index.get_dictionary()
    externalDocId = {}
    for doc in range(index.document_base(), index.maximum_document()):
        extD, _ = index.document(doc)
        externalDocId[extD] = doc
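    # pyndri numbers documents internally from document_base() up to maximum_document();
    # index.document(internal_id) returns (external_doc_id, token_ids), so this loop
    # builds the reverse map from external (collection) ids to internal ids.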
    print("Extract queries ...")
    queries = {}
    if config["train_queries"] == config["test_queries"]:
        queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec"\
            else extract_trec_million_queries(config["train_queries"])
    else:
        train_queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec" \
            else extract_trec_million_queries(config["train_queries"])
        test_queries = extractTopics(config["test_queries"]) if config["test_queries_format"] == "trec" \
            else extract_trec_million_queries(config["test_queries"])
        queries = {**train_queries, **test_queries}
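        # Dict unpacking merges the two topic sets; if a query id appears in both,
        # the test_queries entry overrides the train_queries one.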
    print("{n} queries to process.".format(n=len(queries)))

    queries_text = {}
    q_times = defaultdict(int)
    print("Preprocess queries ...")
    for q in tqdm(queries):
        q_text = clean(queries[q], "krovetz", set())
        q_times[q_text] += 1
        queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
            [q_text, str(q_times[q_text])])
Example #4
        Usage:
            construcrQueryFile.py <topics_folder> <collection_name> <outputfolder> [--tq | --mq]
            
        Options:
            --tq    if topics_folder contains trec query files.
            --mq    if topics_folder contains trec_million query files.
        """)
    
    print("\nparameters : \n")
    print(args)

    topics = {}
    if args["--tq"]:
        topics = extract_trec_topics(args["<topics_folder>"])
    elif args["--mq"]:
        topics = extract_trec_million_queries(args["<topics_folder>"])
    else:
        print("No queries to extract")

    outputFile = open(join(args["<outputfolder>"],
                           "RetrievalParameterFile_{name}.xml".format(name=args["<collection_name>"])), 'w')
    outputFile.write("<parameters>\n")
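    # What follows presumably writes one <query> element per topic (with <number> and
    # <text> children) and finally the closing </parameters> tag, i.e. the layout
    # IndriRunQuery expects for a batch retrieval parameter file.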

    tokenizer = MosesTokenizer()

    prog = re.compile(r"[_\-\(]*([A-Z]\.(\ )*)*[_\-\(]*")
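    # This pattern presumably matches leading runs of underscores, hyphens and opening
    # parentheses as well as acronym-style sequences such as "U. S. ", so they can be
    # stripped from the topic text before tokenization.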
    tops = {}
    for top in topics:
        terms = topics[top].split()
        toptext = ""
        for t in terms: