Пример #1
0
def best_titles(claim="", edocs=None, best=5, model=None):
    """Return the `best` highest-scoring document titles found in `claim`.

    Args:
        claim: Claim text to search for candidate titles.
        edocs: Title-lookup structure (an `edict`); a fresh empty `edict`
            is created when omitted.  NOTE(review): the original defaulted
            to a single shared mutable `edict()` instance.
        best: Number of top-scoring titles to keep.
        model: Optional scoring model forwarded to `score_title`.

    Returns:
        List of (title, score) pairs, sorted by score descending,
        truncated to `best` entries.
    """
    if edocs is None:
        edocs = edict()
    # Map each candidate title to the phrases/positions where it appears.
    t2phrases = find_titles_in_claim(claim, edocs)
    tscores = [(title, score_title(phrases, title, claim, model))
               for title, phrases in t2phrases.items()]
    # Highest score first; keep only the top `best`.
    return sorted(tscores, key=lambda ts: ts[1], reverse=True)[:best]
Пример #2
0
def title_edict(t2jnum=None):
    """Build an `edict` mapping normalized titles to lists of raw titles.

    Args:
        t2jnum: Mapping whose keys are document titles (values are not
            used here); defaults to an empty mapping.  NOTE(review): the
            original used a shared mutable `{}` default.

    Returns:
        `edict` keyed by normalized title; each entry collects every raw
        title that normalizes to that key.
    """
    if t2jnum is None:
        t2jnum = {}
    edocs = edict()
    for title in t2jnum:
        l_txt = normalize_title(title)
        if len(l_txt) > 0:
            # edict lookup appears to return a (value, ...) pair where the
            # value is None until the key is assigned -- TODO confirm the
            # edict.__getitem__/__setitem__ semantics against its definition.
            if edocs[l_txt][0] is None:
                edocs[l_txt] = []
            edocs[l_txt][0].append(title)
    return edocs
Пример #3
0
def doc_ret(data=None, edocs=None, best=5, model=None):
    """Return the `best` highest-scoring document titles for each claim.

    Args:
        data: Iterable of examples, each a mapping with 'claim' and 'id'
            keys; defaults to an empty list.  NOTE(review): the original
            used shared mutable defaults (`list()` / `edict()`).
        edocs: Title-lookup structure passed through to `best_titles`.
        best: Number of top titles to keep per claim.
        model: Optional scoring model forwarded to `best_titles`.

    Returns:
        Dict mapping example id -> list of (title, score) pairs.
    """
    if data is None:
        data = []
    if edocs is None:
        edocs = edict()
    docs = dict()
    for example in tqdm(data):
        docs[example['id']] = best_titles(example['claim'], edocs, best, model)
    return docs
Пример #4
0
def doc_ir(data=None, edocs=None, best=5, model=None):
    """Return a dictionary of the n best document titles for each claim.

    Args:
        data: Iterable of examples, each a mapping with "claim" and "id"
            keys; defaults to an empty list.  NOTE(review): the original
            used shared mutable defaults (`list()` / `edict()`).
        edocs: Title-lookup structure passed through to `best_titles`.
        best: Number of top titles to keep per claim.
        model: Optional scoring model forwarded to `best_titles`.

    Returns:
        Dict mapping example id -> list of (title, score) pairs.
    """
    if data is None:
        data = []
    if edocs is None:
        edocs = edict()
    docs = dict()
    for example in tqdm(data):
        docs[example["id"]] = best_titles(example["claim"], edocs, best, model)
    return docs
Пример #5
0
def find_titles_in_claim(claim="", edocs=None):
    """Find title phrases occurring in `claim` and where they start.

    Args:
        claim: Claim text to tokenize and scan.
        edocs: Title-lookup structure wrapped in a `pdict` for phrase
            matching; a fresh empty `edict` is used when omitted.
            NOTE(review): the original defaulted to a shared mutable
            `edict()` instance.

    Returns:
        Dict mapping each matched document title to a list of
        (phrase, start) pairs for its occurrences in the claim.
    """
    if edocs is None:
        edocs = edict()
    find = pdict(edocs)
    docset = {}
    for word in word_tokenize(claim):
        # find[word] yields (dlist, phrase, start) matches anchored at
        # this token -- presumably from pdict's phrase index; verify.
        for dlist, phrase, start in find[word]:
            for d in dlist:
                docset.setdefault(d, []).append((phrase, start))
    return docset
Пример #6
0
def title_edict(t2jnum=None):
    """Build a dictionary (edict) of normalized document titles.

    Args:
        t2jnum: Mapping whose keys are document titles (values unused
            here); defaults to an empty mapping.  NOTE(review): the
            original used a shared mutable `{}` default.

    Returns:
        `edict` keyed by normalized title; each entry holds the list of
        raw titles normalizing to that key.
    """
    if t2jnum is None:
        t2jnum = {}
    edocs = edict()
    for title in t2jnum:
        _title = normalize_title(title)
        if len(_title) > 0:
            # edict lookup appears to return a (value, ...) pair whose
            # value is None until assigned -- TODO confirm edict semantics.
            if edocs[_title][0] is None:
                edocs[_title] = []
            edocs[_title][0].append(title)
    return edocs
Пример #7
0
def find_titles_in_claim(claim='', edocs=None):
    """Find title phrases in the claim and their start positions.

    Args:
        claim: Claim text to tokenize and scan.
        edocs: Title-lookup structure wrapped in a `pdict`; a fresh empty
            `edict` is used when omitted.  NOTE(review): the original
            defaulted to a shared mutable `edict()` instance.

    Returns:
        Dict mapping each matched title to a list of (phrase, start)
        pairs for its occurrences in the claim.
    """
    if edocs is None:
        edocs = edict()
    find = pdict(edocs)
    docset = {}
    # Tokenize the claim.
    ctoks = word_tokenize(claim)

    for word in ctoks:
        for dlist, phrase, start in find[word]:
            for d in dlist:
                docset.setdefault(d, []).append((phrase, start))
    return docset
Пример #8
0
def best_titles(claim='', edocs=None, best=5, model=None):
    """Compute the top-`best` scoring document titles appearing in the claim.

    Args:
        claim: Claim text to search for candidate titles.
        edocs: Title-lookup structure; a fresh empty `edict` is created
            when omitted.  NOTE(review): the original defaulted to a
            shared mutable `edict()` instance.
        best: Number of top-scoring titles to keep.
        model: Optional scoring model forwarded to `score_title`.

    Returns:
        List of (title, score) pairs sorted by score descending,
        truncated to `best` entries.
    """
    if edocs is None:
        edocs = edict()
    # Locate document titles and the phrases matching them in the claim.
    t2phrases = find_titles_in_claim(claim, edocs)

    # Score every candidate title.
    tscores = [(title, score_title(phrases, title, claim, model))
               for title, phrases in t2phrases.items()]

    # Sort by score descending and keep the top `best`.
    return sorted(tscores, key=lambda ts: ts[1], reverse=True)[:best]
Пример #9
0
def doc_ir(data=None, edocs=None, best=5, model=None):
    """Return a dictionary of the n best document titles for each claim.

    Args:
        data: Iterable of examples, each a mapping with "claim" and "id"
            keys; defaults to an empty list.  NOTE(review): the original
            used shared mutable defaults (`list()` / `edict()`).
        edocs: Title-lookup structure used to find candidate titles.
        best: Number of top titles to keep per claim.
        model: Optional scoring model forwarded to `best_titles`.

    Returns:
        Dict mapping example id -> list of best titles per claim.
    """
    if data is None:
        data = []
    if edocs is None:
        edocs = edict()
    # First pass: collect candidate titles and lowercase tokens per claim.
    rdocs = dict()
    for example in tqdm(data):
        claim = example["claim"]
        titles = find_titles_in_claim(claim, edocs)
        ctoks = word_tokenize(claim.lower())
        rdocs[example["id"]] = (titles, ctoks)
    # Load term frequencies once for all collected candidate titles.
    t2tf = titles_to_tf()
    doctf = load_doc_tf(rdocs, t2tf)
    # Second pass: score and rank titles for each claim.
    docs = dict()
    for example in tqdm(data):
        titles, ctoks = rdocs[example["id"]]
        # NOTE(review): this 6-argument call does not match the
        # 4-parameter `best_titles` definitions elsewhere in this file --
        # it presumably targets a tf-based variant; confirm which
        # `best_titles` is in scope here.
        tscores = best_titles(example["claim"], ctoks, titles, doctf, best,
                              model)
        docs[example["id"]] = tscores
    return docs