示例#1
0
# Run-mode flags read from `args` (defined outside this excerpt).
is_training = args.is_training
is_short = args.is_short
is_spoken = args.is_spoken

# CommonPath is built from the three flags and hands out every path used below.
path = CommonPath(is_training, is_short, is_spoken)
log_filename = path.getLogFilename()
qry_path = path.getQryPath()
doc_path = path.getDocPath()
rel_path = path.getRelPath()

# Dictionary and BG (presumably background-model -- confirm) paths; neither is
# used in the visible part of this script.
dict_path = path.getDictPath()
bg_path = path.getBGPath()

print("Vector-Space-Model")
# Read the relevance set for queries and documents.
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()

# Read the query and document files that feed the preprocessing below.
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

# Term frequency: preprocess queries (together with the relevance set) and
# documents into dictionaries.
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)

# Convert each dictionary to a numpy array so it can be used in computation;
# dict2npSparse also returns the matching IDs as a second value.
qry_mdl_np_, qry_IDs = ProcDoc.dict2npSparse(qry_mdl_dict)
doc_mdl_np_, doc_IDs = ProcDoc.dict2npSparse(doc_mdl_dict)

# TF-IDF
示例#2
0

def ID2Word(proc_dict, ID_map, score_dict):
    """Swap IDs for their mapped values in place and expand the scores.

    Every ID in each sequence of ``proc_dict`` is overwritten with
    ``ID_map[ID]`` (this includes the sentinel ID ``-1``, which is still
    looked up in ``ID_map``).  For every non-sentinel ID, the score
    ``score_dict[key][ID]`` is repeated ``len(ID_map[ID])`` times and
    appended to that key's list in the returned score mapping.

    Args:
        proc_dict: Mapping of key -> mutable sequence of IDs; modified in place.
        ID_map: Lookup from ID to its replacement value.  Presumably the
            values are words (strings), so each score is repeated once per
            character -- confirm against the caller.
        score_dict: Mapping of key -> per-ID score lookup.

    Returns:
        A tuple ``(proc_dict, att_dict)``: the same ``proc_dict`` object
        after replacement, and a ``defaultdict(list)`` holding the repeated
        scores per key.  Keys that contribute no scores are absent from it.
    """
    expanded_scores = defaultdict(list)

    for name, id_seq in proc_dict.items():
        for pos, token_id in enumerate(id_seq):
            mapped = ID_map[token_id]
            id_seq[pos] = mapped

            # The sentinel is replaced above but carries no score.
            if token_id == -1:
                continue

            score = score_dict[name][token_id]
            repeat = len(mapped)
            # Only touch the defaultdict when there is something to add, so
            # that empty mapped values do not create an empty entry.
            if repeat:
                expanded_scores[name].extend([score] * repeat)

    return proc_dict, expanded_scores


# Read the relevance set for queries and documents.
# rel_path, qry_path and doc_path are defined outside this excerpt.
# NOTE(review): the literal False presumably stands for "not training" -- confirm.
eval_mdl = Evaluate.EvaluateModel(rel_path, False)
rel_set = eval_mdl.getAset()

# Read queries and documents.
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)

# Preprocess while preserving position information.
# NOTE(review): the two extra True arguments presumably switch on that mode --
# confirm against ProcDoc.qryPreproc / ProcDoc.docPreproc.
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set, True, True)
doc_mdl_dict = ProcDoc.docPreproc(doc_file, True, True)

# Bag of words: the same preprocessing calls with their default options.
qry_bow_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_bow_dict = ProcDoc.docPreproc(doc_file)

# unigram