def create_dictory():
    """
    This function is the summary of data pre-processing in Subtask2.
    """
    dictory = {}
    path = PATH + 'data/wiki-pages/wiki-pages/'
    files = os.listdir(path)
    D = 0
    documents = []
    for f in files:
        data = load_dataset_json(os.path.join(path, f))
        documents += data
        for d in data:
            D += 1
            text = [t for t in set(d['text'].split(' ')) if t]  # unique, non-empty terms
            for t in text:
                t = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", t)
                if t.isdigit():
                    continue
                if t not in dictory:
                    dictory[t] = [d['id']]
                else:
                    dictory[t].append(d['id'])
    print('complete')
    return dictory, documents, D
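
# A minimal usage sketch of the index built above (the term 'Einstein' is only a
# hypothetical example): the length of a term's posting list in 'dictory' is its
# document frequency, which is what the IDF computations in Subtask2 rely on.
def _dictory_usage_sketch():
    dictory, documents, D = create_dictory()
    term = 'Einstein'                      # hypothetical example term
    df = len(dictory.get(term, []))        # number of documents containing the term
    idf = log(D / (1 + df))                # same IDF form as used in calculate_doc
    print(term, df, idf)
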
def Subtask8_pre_2():
    """
    The output is two document. The 'pos.txt' including the claim and evidence sentence. The 'neg.txt' including the negative sample.
    """
    train_data = load_dataset_json(PATH + 'train.jsonl')

    with open(PATH + 'Subtask8_pre_1.txt', encoding='utf-8') as f:
        document = eval(f.read())

    pos = open(PATH + 'pos.txt', 'w', encoding='utf8')
    neg = open(PATH + 'neg.txt', 'w', encoding='utf8')
    for data in train_data:
        if data['label'] != 'NOT ENOUGH INFO':
            claim = data['claim'][:-1].lower()
            fp = pos if data['label'] == 'SUPPORTS' else neg
            fp.write(claim + '\n')
            for evidence in data['evidence']:
                # print(evidence)
                if evidence[0][2]:
                    tmp = evidence[0][2]
                    if tmp in document:
                        # print(document[tmp])
                        line = document[tmp].split('\n')[
                            evidence[0][3]].replace(
                                str(evidence[0][3]) + '\t', '')
                        # print(line)
                        fp.write(line + '\n')
    pos.close()
    neg.close()
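
# Each output file interleaves the lower-cased claims with the evidence sentences
# found for them, one sentence per line (layout shown schematically, not real data):
#
#   <claim 1>
#   <evidence sentence for claim 1>
#   <evidence sentence for claim 1>
#   <claim 2>
#   ...
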
def Subtask4_pre_train_2_Laplace(number):
    """
    This function is the same as the function for Laplace Smoothing query-likelihood unigram language model in Subtask3.
    """
    alpha = 0.5

    train_data = load_dataset_json(PATH + 'data/train.jsonl',
                                   instance_num=number)
    data = np.load(PATH + 'pre_train_1_Subtask4.npy', allow_pickle=True).item()

    id_list = []
    for d in train_data:
        if d['label'] != 'NOT ENOUGH INFO':
            claim_id = d['id']
            id_list.append(claim_id)

    documents = {}
    for x in id_list:
        # claim = None
        for d in train_data:
            if d['id'] == x:
                claim = d['claim'][:-1]
                claim = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", claim)
                claim = claim.split(' ')
                # break

                C = sum(data.values())
                f = []

                files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
                for i in files:
                    with open(
                            os.path.join(PATH + 'data/wiki-pages/wiki-pages/',
                                         i)) as fp:
                        lines = fp.readlines()
                        for line in lines:
                            page = eval(line)  # each line of the wiki dump is a dict literal
                            text = page['text'].split(' ')
                            tmp = 0
                            for w in claim:
                                if w in text:
                                    p = (text.count(w) + 1) / (len(text) + 1)
                                else:
                                    if w in data:
                                        p = alpha * (data[w] + 1) / (C + len(data))
                                    else:
                                        p = 0.01
                                tmp += log(p)
                            f.append((page['id'], tmp))
                f.sort(key=lambda x: x[1], reverse=True)
                evidence = []
                for i in range(5):
                    name = f[i][0]
                    evidence.append(name)
                documents[d['claim']] = evidence
    np.save(PATH + "pre_train_2_Subtask4.npy", documents)
    print('save complete')
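
# A self-contained toy illustration of the Laplace-style scoring used above (all
# strings and counts below are made up for illustration, not taken from the data):
# terms found in the document get an add-one in-document estimate, terms found only
# in the collection fall back to a scaled collection estimate, and unseen terms get
# a small constant probability.
def _laplace_scoring_toy():
    from math import log
    alpha = 0.5
    collection_counts = {'paris': 40, 'france': 25, 'capital': 10}   # toy collection counts
    C = sum(collection_counts.values())
    document = 'paris is the capital of france'.split(' ')
    claim = 'paris capital france berlin'.split(' ')
    score = 0
    for w in claim:
        if w in document:
            p = (document.count(w) + 1) / (len(document) + 1)
        elif w in collection_counts:
            p = alpha * (collection_counts[w] + 1) / (C + len(collection_counts))
        else:
            p = 0.01    # unseen everywhere: small constant, as in the function above
        score += log(p)
    return score
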
def calculate_doc(claim_id, dictory, documents, D):
    """
    This function is the the optimized cosine similarly function in Subtask2
    The output of the function is dictionary which the key is the document id of the 5 most similar documents of the claim and the value is the document 'line' of each 'id'.
    """
    train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num=20)

    claim = None
    for d in train_data:
        if d['id'] == claim_id:
            d['claim'] = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", d['claim'])
            claim = d['claim'].split(' ')
            break
    # print(d['id'] , d['claim'])

    keys = []
    for c in claim:
        tf = claim.count(c) / len(claim)
        idf = log((D / (1 + (len(dictory[c]) if c in dictory else 0))))
        keys.append(
            (c, tf * idf, idf, tf, (len(dictory[c]) if c in dictory else 0)))
    keys.sort(key=lambda x: x[1], reverse=True)
    keys = keys[:5]
    vec1 = [k[1] for k in keys]
    # print(keys)

    document_tfidf = []
    for d in documents:
        text = d['text'].split(' ')
        vec2 = []
        for k in keys:
            tf = text.count(k[0]) / len(text)
            idf = k[2]
            vec2.append(tf * idf)
        sim = cosine_similarity(vec1, vec2)
        document_tfidf.append([d['id'], sim])
    document_tfidf.sort(key=lambda x: x[1], reverse=True)

    evidence = []
    for i in range(5):
        name = document_tfidf[i][0]
        evidence.append(name)

    files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
    docu = {}
    for i in files:
        with open(os.path.join(PATH + 'data/wiki-pages/wiki-pages/', i)) as fp:
            lines = fp.readlines()
            for line in lines:
                line = eval(line)
                if line['id'] in evidence:
                    text = line['lines']
                    docu[line['id']] = text
    return docu
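
# calculate_doc (and Subtask2_cossim below) call a cosine_similarity helper that is
# not shown in this section. A minimal sketch of a helper consistent with how it is
# called here (two equal-length lists of tf*idf weights) could look like this; the
# project's actual implementation may differ.
def cosine_similarity_sketch(vec1, vec2):
    from math import sqrt
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = sqrt(sum(a * a for a in vec1))
    norm2 = sqrt(sum(b * b for b in vec2))
    if norm1 == 0 or norm2 == 0:
        return 0.0          # treat an all-zero vector as having no similarity
    return dot / (norm1 * norm2)
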
def Subtask4_pre_dev_4():
    """
    The output is a list of [claim, evidence document, sentence_number] triples for
    the dev set, with duplicates removed, saved to 'pre_dev_4_Subtask4.txt'.
    """
    evidence_data = load_dataset_json(PATH + 'data/dev.jsonl', instance_num=20)
    evi = []
    for i in evidence_data:
        for j in i['evidence']:
            evi.append([i['claim'], j[0][2], j[0][3]])
    evi_new = []
    for e in evi:
        if e not in evi_new:
            evi_new.append(e)

    with open(PATH + 'pre_dev_4_Subtask4.txt', 'w', encoding='utf-8') as f:
        f.write(str(evi_new))
def Subtask4_pre_dev_2_Laplace(index):
    """
    Same Laplace-smoothed query-likelihood model as Subtask4_pre_train_2_Laplace,
    applied to the dev claims whose ids are listed in `index`; the top-5 documents
    per claim are saved to 'pre_dev_2_Subtask4.npy'.
    """
    alpha = 0.5

    dev_data = load_dataset_json(PATH + 'data/dev.jsonl', instance_num=20)
    data = np.load(PATH + 'pre_train_1_Subtask4.npy', allow_pickle=True).item()

    documents = {}
    index_list = index
    for x in index_list:
        # claim = None
        for d in dev_data:
            if d['id'] == x:
                claim = d['claim'][:-1]
                claim = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", claim)
                claim = claim.split(' ')
                # break

                C = sum(data.values())
                f = []

                files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
                for i in files:
                    with open(
                            os.path.join(PATH + 'data/wiki-pages/wiki-pages/',
                                         i)) as fp:
                        lines = fp.readlines()
                        for line in lines:
                            page = eval(line)  # each line of the wiki dump is a dict literal
                            text = page['text'].split(' ')
                            tmp = 0
                            for w in claim:
                                if w in text:
                                    p = (text.count(w) + 1) / (len(text) + 1)
                                else:
                                    if w in data:
                                        p = alpha * (data[w] + 1) / (C + len(data))
                                    else:
                                        p = 0.01
                                tmp += log(p)
                            f.append((page['id'], tmp))
                f.sort(key=lambda x: x[1], reverse=True)
                evidence = []
                for i in range(5):
                    name = f[i][0]
                    evidence.append(name)
                documents[d['claim']] = evidence
    np.save(PATH + "pre_dev_2_Subtask4.npy", documents)
    print('save complete')
def Subtask3_0(claim_id):
    """
    The input is the claim id.
    The output is the claim, the 5 most similar documents and the query-likelihood unigram language model value.
    The out putis save in the 'Q3_unigram.csv'.
    In the query likelihood unigram language model, I do some smoothing to improve the result.
    For the terms in claim which do not appear in the document, the probability is not 0 but the probability it appear in the wiki-pages or p = 0.01 for the term even not appear in wiki-pages.
    """
    alpha = 0.5

    train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num=20)

    claim = None
    for d in train_data:
        if d['id'] == claim_id:
            claim = d['claim'][:-1]
            claim = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", claim)
            claim = claim.split(' ')
            break
    print(d['id'])
    print(d['claim'])

    data = np.load(PATH + 'n_dict_Subtask3.npy', allow_pickle=True).item()
    C = sum(data.values())

    f = []
    files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
    for i in files:
        with open(os.path.join(PATH + 'data/wiki-pages/wiki-pages/', i)) as fp:
            lines = fp.readlines()
            for line in lines:
                page = eval(line)
                text = page['text'].split(' ')  # split into terms, as in the other subtasks
                tmp = 0
                for w in claim:
                    if w in text:
                        # probability for a claim term that appears in the document
                        p = text.count(w) / len(text)
                    elif w in data:
                        # fall back to the scaled probability of the term in the whole wiki-pages collection
                        p = alpha * data[w] / C
                    else:
                        # terms that do not appear in wiki-pages at all get probability 0.001
                        p = 0.001
                    tmp += log(p)  # accumulate log(p) over the claim terms
                f.append((page['id'], tmp))
    f.sort(key=lambda x: x[1], reverse=True)
    return f[:5]
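
# A minimal usage sketch (the id below is only a placeholder; any claim id present
# in the first 20 training instances works):
#
#   top5 = Subtask3_0(some_claim_id)
#   for doc_id, log_likelihood in top5:
#       print(doc_id, log_likelihood)
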
def Subtask4_pre_train_4():
    """
    The output is a list which the structure is [claim, evidence document, sentence_number].
    """
    evidence_data = load_dataset_json(PATH + 'data/train.jsonl',
                                      instance_num=200)
    evi = []
    for i in evidence_data:
        for j in i['evidence']:
            evi.append([i['claim'], j[0][2], j[0][3]])
    evi_new = []
    for e in evi:
        if e not in evi_new:
            evi_new.append(e)

    with open(PATH + 'pre_train_4_Subtask4.txt', 'w', encoding='utf-8') as f:
        f.write(str(evi_new))
def Subtask3_Dirichlet(claim_id):
    """
    The input is the claim id.
    The output is the claim, the 5 most similar documents and the Dirichlet Smoothing query-likelihood unigram language
    model value.
    The output is save in the 'Q3_dirichlet.csv'.
    """
    train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num=20)

    claim = None
    for d in train_data:
        if d['id'] == claim_id:
            claim = d['claim'][:-1]
            claim = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", claim)
            claim = claim.split(' ')
            break
    print(d['id'])
    print(d['claim'])

    data = np.load(PATH + 'n_dict_Subtask3.npy', allow_pickle=True).item()
    C = sum(data.values())
    N = 5396106
    u = C / N
    f = []

    files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
    for i in files:
        with open(os.path.join(PATH + 'data/wiki-pages/wiki-pages/', i)) as fp:
            lines = fp.readlines()
            for line in lines:
                page = eval(line)
                text = page['text'].split(' ')
                tmp = 0
                for w in claim:
                    if w in data:
                        # Dirichlet-smoothed probability when the term has a collection estimate
                        p = (text.count(w) + u * data[w] / C) / (len(text) + u)
                    else:
                        # no collection estimate: only the in-document count is used
                        # (note p is 0, and log(p) fails, if the term is absent from the document too)
                        p = text.count(w) / (len(text) + u)
                    tmp += log(p)  # accumulate log(p) over the claim terms
                f.append((page['id'], tmp))
    f.sort(key=lambda x: x[1], reverse=True)
    return f[:5]
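
# A self-contained toy illustration of the Dirichlet smoothing used above (all
# counts and strings are made up): the collection estimate data[w] / C is mixed
# into the document estimate with pseudo-count weight u.
def _dirichlet_scoring_toy():
    from math import log
    collection_counts = {'paris': 40, 'france': 25, 'capital': 10}   # toy collection counts
    C = sum(collection_counts.values())
    u = 20.0                                # toy pseudo-count; the function above uses u = C / N
    document = 'paris is the capital of france'.split(' ')
    claim = 'paris capital france'.split(' ')
    score = 0
    for w in claim:
        p_collection = collection_counts.get(w, 0) / C
        p = (document.count(w) + u * p_collection) / (len(document) + u)
        score += log(p)
    return score
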
def prepare_dev_Subtask6_1():
    """
    Same as prepare_train_Subtask6_1 but for the dev set: build a dictionary mapping
    each evidence document id to its 'lines' field (first 500 dev instances) and
    save it to 'prepare_dev_Subtask6_1.txt'.
    """
    train_data = load_dataset_json(PATH + 'data/dev.jsonl', instance_num=500)
    evidence = []
    for i in train_data:
        for j in i['evidence']:
            evidence.append(j[0][2])
    files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
    documents = {}
    for i in files:
        with open(os.path.join(PATH + 'data/wiki-pages/wiki-pages/', i)) as fp:
            lines = fp.readlines()
            for line in lines:
                line = eval(line)
                if line['id'] in evidence:
                    text = line['lines']
                    documents[line['id']] = text
    with open(PATH + 'prepare_dev_Subtask6_1.txt', 'w', encoding='utf-8') as f:
        f.write(str(documents))
def Subtask2_cossim(claim_id, numberofducuments):
    """
    The input is the list of claim 'id' and the total number of documents.
    The output is the claim, the top 5 TF-IDF terms in the claim, the TF-IDF of these terms, the five most similar documents with the claim and the cosine similarity between them.
    The top 5 TF-IDF terms in the claim and the TF-IDF of these terms is save in the 'Q2_claim_TF-IDF.csv' and the claim with the five most similar documents with the claim is save in the 'Q2_vector_space.csv'.
    """
    data = np.load(PATH + 'diction_Subtask2.npy', allow_pickle=True).item()
    dictory = np.load(PATH + 'dictory_Subtask2.npy', allow_pickle=True).item()
    train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num=20)

    claim = None
    for d in train_data:
        if d['id'] == claim_id:
            d['claim'] = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", d['claim'])
            claim = d['claim'].split(' ')
            break
    print(d['id'])
    print(d['claim'])

    keys = []
    for c in claim:
        tf = claim.count(c) / len(claim)
        idf = log((numberofducuments / (1 + (len(dictory[c]) if c in dictory else 0))))
        keys.append((c, tf * idf, idf, tf))
    keys.sort(key=lambda x: x[1], reverse=True)
    keys = keys[:5]
    word = [k[0] for k in keys]
    vec1 = [k[1] for k in keys]  # vec1 is the list of tf*IDF of top 5 words in the claim.
    print(word)
    print(vec1)

    document_tfidf = []
    for d in data.items():
        text = d[1].split(' ')
        vec2 = []
        for k in keys:
            tf = text.count(k[0]) / len(text)
            idf = k[2]
            vec2.append(tf * idf)  # vec2 is the list of tf*IDF of top 5 words in the document.
        sim = cosine_similarity(vec1, vec2)
        document_tfidf.append([d[0], sim])
    document_tfidf.sort(key=lambda x: x[1], reverse=True)
    return document_tfidf[:5]
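
# A small worked illustration of the TF-IDF weighting used above (all numbers are
# made up): for a 6-token claim containing 'paris' once, in a corpus of 1000
# documents of which 100 contain 'paris', tf = 1/6 and idf = log(1000 / (1 + 100)).
def _tfidf_toy():
    from math import log
    claim = 'the eiffel tower is in paris'.split(' ')     # toy claim, 6 tokens
    number_of_documents, document_frequency = 1000, 100   # toy corpus statistics
    tf = claim.count('paris') / len(claim)
    idf = log(number_of_documents / (1 + document_frequency))
    return tf * idf
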
def prepare_train_Subtask6_1():
    """
    This function is the same as the function uss in the Subtask4 and the purpose of it is to get a dictionary which the key is document id and the value is the 'line'.
    The instance number we use at this project is 5000.
    """
    train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num = 5000)
    evidence = []
    for i in train_data:
        for j in i['evidence']:
            evidence.append(j[0][2])
    files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
    documents = {}
    for i in files:
        with open(os.path.join(PATH + 'data/wiki-pages/wiki-pages/', i)) as fp:
            lines = fp.readlines()
            for line in lines:
                line = eval(line)
                if line['id'] in evidence:
                    text = line['lines']
                    documents[line['id']] = text
    with open(PATH + 'prepare_train_Subtask6_1.txt', 'w', encoding='utf-8') as f:
        f.write(str(documents))
def Subtask8_pre_1():
    """
    The output is a dictionary which the key is the document 'id' and the value is 'lines' in wiki-pages.
    """
    train_data = load_dataset_json(PATH + 'train.jsonl')

    evidence = []
    for d in train_data.items():
        for i in range(5):
            evidence.append(d[1][i])

    files = os.listdir(PATH + 'data/wiki-pages/wiki-pages/')
    documents = {}
    for i in files:
        with open(os.path.join(PATH + 'data/wiki-pages/wiki-pages/', i)) as fp:
            lines = fp.readlines()
            for line in lines:
                line = eval(line)
                if line['id'] in evidence:
                    text = line['lines']
                    documents[line['id']] = text
    with open(PATH + 'Subtask8_pre_1.txt', 'w', encoding='utf-8') as f:
        f.write(str(documents))
def prepare_train_Subtask6_2():
    """
    This function aim to connect the claim, the evidence sentence and the label. And embedding this list.
    The output of this function is the train data for the neural network and it is a 601-dimensional vector for training
    """
    with open(PATH + 'prepare_train_Subtask6_1.txt', encoding='utf-8') as f:
        document = eval(f.read())

    train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num=5000)
    model = word2vec.KeyedVectors.load_word2vec_format(PATH + "data/GoogleNews-vectors-negative300.bin", binary=True)

    with open(PATH + 'traindata_Subtask6.txt', 'w') as fp:
        for data in train_data:
            if data['label'] != 'NOT ENOUGH INFO':
                claim = data['claim'][:-1]
                claim = re.sub("[-,.。:_=+*&^%$#@!?()<>/`';|]", "", claim)
                claim = claim.split(' ')
                claim = list(filter(lambda x: x in model.vocab, claim))
                Vi = []
                for i in range(len(claim)):
                    Vi.append(model[claim[i]])

                V = np.zeros(len(Vi[0]))
                for i in range(len(claim)):
                    for j in range(len(Vi[0])):
                        V[j] = V[j] + Vi[i][j]

                rms = 0
                for i in range(len(Vi[0])):
                    rms += V[i] * V[i]
                rms = np.sqrt(rms / len(Vi[0]))

                for i in range(len(Vi[0])):
                    V[i] = V[i] / rms

                label = '1' if data['label'] == 'SUPPORTS' else '0'
                # V = V.astype(str).tolist()

                for evidence in data['evidence']:
                    # print(evidence)
                    if evidence[0][2]:
                        tmp = evidence[0][2]
                        if tmp in document:
                            # print(document[tmp])
                            lines = document[tmp].split('\n')
                            # for k in range(len(lines)):
                            line = lines[evidence[0][3]].replace(str(evidence[0][3]) + '\t', '')
                            line = re.sub('[-,.。:_=+*&^%$#@!?()<>/]', '', line)
                            line = line.split(' ')
                            line = list(filter(lambda x: x in model.vocab, line))
                            # print(line)
                            Vi = []
                            for i in range(len(line)):
                                Vi.append(model[line[i]])
                            V1 = np.zeros(len(Vi[0]))
                            for i in range(len(line)):
                                for j in range(len(Vi[0])):
                                    V1[j] = V1[j] + Vi[i][j]
                            rms = 0
                            for i in range(len(Vi[0])):
                                rms += V1[i] * V1[i]
                            rms = np.sqrt(rms / len(Vi[0]))
                            for i in range(len(Vi[0])):
                                V1[i] = V1[i] / rms
                            # res = V - V1
                            # print(type(V))
                            res1 = V.astype(str).tolist()
                            res2 = V1.astype(str).tolist()

                            fp.write(' '.join(res1) + ' ' + ' '.join(res2) + ' ' + label + '\n')
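
# The two vector-building loops above implement the same operation: sum the 300-d
# word2vec vectors of the in-vocabulary tokens and divide by the RMS of the summed
# vector. A compact equivalent sketch (assuming the same gensim model object and
# the pre-4.0 gensim API used above, and at least one in-vocabulary token):
def sentence_vector_sketch(tokens, model):
    vectors = [model[t] for t in tokens if t in model.vocab]
    v = np.sum(vectors, axis=0)             # element-wise sum of the word vectors
    rms = np.sqrt(np.mean(v * v))           # root-mean-square of the summed vector
    return v / rms
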