def filter_dp_triplets(filenames, i, files):
    # Filter DP triplets based on vocab; convert DP dicts to (H, R, T) triplets.
    vocab = set(F.load_to_file(filenames.vocab_file, filenames.output_folder))
    for f in files:
        relation = []
        final_triplet = []
        triplet_data = F.load_to_file("dp_data_pos/" + f, filenames.output_folder)
        # Find H R T
        for sent in triplet_data:
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            # Keep only triplets whose head and tail are both in the vocabulary.
            if H in vocab and T in vocab:
                final_triplet.append((H, R, T))
        print(f)
        F.save_to_file("Filtered_DP/" + filenames.dp_triplet_file + "_" + f, final_triplet, filenames.output_folder)
        F.save_to_file("Relations_DP/" + filenames.dp_relation_file + "_" + f, relation, filenames.output_folder)
def getVocabulary(words, less, filenames):
    import os

    # Lowercase all words once and cache the result to file.
    if not os.path.isfile(filenames.output_folder + '/' + filenames.lower_words_file_name):
        words_lower = [w.lower() for w in words]
        F.save_to_file(filenames.lower_words_file_name, words_lower, filenames.output_folder)
    else:
        print("Words File Found")
        words_lower = F.load_to_file(filenames.lower_words_file_name, filenames.output_folder)

    print("Lower words count", len(words_lower))

    # Remove words occurring fewer than `less` times.
    d = Counter(words_lower)
    v = list(d.keys())

    # Write all words with their counts.
    with open(filenames.output_folder + '/count_of_all_words.csv', 'w') as f:
        for k in d:
            f.write(str(k) + "\t" + str(d[k]) + "\n")

    for k in v:
        if d[k] < less:
            del d[k]
    vocab = list(d.keys())
    print("Removing less", str(less), len(vocab))
    vocab = [w for w in vocab if not re.match(r'.*[0-9]+.*', w)]
    print("Removing Numbers", len(vocab))
    vocab = [w for w in vocab if not re.match(r'.*[:;,_`=!@#$%^&*()/<>"\'\?\\\+\-\{\}\[\]\|\.]+.*', w)]
    print("Removing Special", len(vocab))

    # Write filtered words with their counts.
    vocab_set = set(vocab)
    with open(filenames.output_folder + '/count_of_filtered_words_' + str(less) + '.csv', 'w') as f:
        for k in d:
            if k in vocab_set:
                f.write(str(k) + "\t" + str(d[k]) + "\n")

    # Replace out-of-vocabulary words with the 'UKN' token.
    updated_words = [w if w in vocab_set else 'UKN' for w in words_lower]
    vocab.append('UKN')
    print(len(updated_words))
    return updated_words, vocab
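# Standalone sketch of the filtering steps in getVocabulary on toy data:
# apply the min-count threshold, then drop tokens containing digits or
# punctuation (same regexes as above). Runnable on its own.
from collections import Counter
import re

toy = ["The", "the", "cat", "Cat", "cat", "42nd", "semi;colon", "dog"]
counts = Counter(w.lower() for w in toy)
kept = [w for w, n in counts.items() if n >= 2]  # min-count ("less") filter
kept = [w for w in kept if not re.match(r'.*[0-9]+.*', w)]
kept = [w for w in kept if not re.match(r'.*[:;,_`=!@#$%^&*()/<>"\'\?\\\+\-\{\}\[\]\|\.]+.*', w)]
print(kept + ['UKN'])  # -> ['the', 'cat', 'UKN']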
# Example 3
def train(func, name, epoch, start, one, folder, learn=0.001):
    embedding_dim = 100
    vocab_dim = len(index_to_word)
    relation_dim = len(index_to_relation)
    if one:
        net = NetOne(embedding_dim, vocab_dim, relation_dim, func)
    else:
        net = Net(embedding_dim, vocab_dim, relation_dim, func)
    if os.path.isfile(F.folder + folder + 'training_t' + name + str(start) +
                      '.pt') and start > 0:
        print("Loaded", start, one, name)
        net.load_state_dict(
            torch.load(F.folder + folder + 'training_t' + name + str(start) +
                       '.pt'))
    else:
        net.apply(weight_init)
    optimizer = optim.SGD(net.parameters(), lr=learn)
    # reduction='sum' is the modern equivalent of the deprecated size_average=False
    MRL = nn.MarginRankingLoss(margin=1, reduction='sum')
    it = 0
    loss_epoch = []
    if (start > 0):
        start += 1
    for i in range(start, epoch):
        dt = F.datetime.now()
        time_t = F.datetime.strftime(dt, "%x %X")
        print(time_t)
        loss_array = []
        for m in positive_table:
            x, x_, t = getBatch(m)
            out_p, out_n = net(x, x_)
            # Target +1: positive scores should outrank negative scores.
            target = torch.ones(1, t)
            loss = MRL(out_p, out_n, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if it % 2000 == 0:
                print("Batch Loss", m, loss.item() / t)
            loss_array.append(loss.item() / t)
            it += 1
        print("Epoch Mean " + str(i) + "==+++++++" +
              str(np.array(loss_array).mean()) + "+++++++==")
        loss_epoch.append(np.array(loss_array).mean())
        F.save_to_file(folder + 'loss_' + name + str(i), loss_array)
        torch.save(net.state_dict(),
                   F.folder + folder + 'training_t' + name + str(i) + '.pt')
        # Early stopping: two consecutive epochs with mean loss below 0.1.
        if i - start > 2:
            if loss_epoch[-1] < 0.1 and loss_epoch[-2] < 0.1:
                break
    plt.plot(range(len(loss_epoch)), loss_epoch)
    plt.show()
    F.save_to_file(folder + 'loss_mean' + name, loss_epoch)
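# Sketch of the loss used in train(): MarginRankingLoss with target +1 pushes
# the positive score above the negative score by at least the margin.
# Runnable on its own, assuming only that PyTorch is installed.
import torch
import torch.nn as nn

mrl = nn.MarginRankingLoss(margin=1.0, reduction='sum')
pos = torch.tensor([2.0, 0.5])
neg = torch.tensor([0.0, 0.4])
target = torch.ones(2)  # +1: the first argument should rank higher
print(mrl(pos, neg, target))  # max(0, 1 - (2-0)) + max(0, 1 - (0.5-0.4)) = 0.9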
def combine_dp_triplets(filenames):
    # Concatenate all per-file filtered DP triplets into one file.
    files = os.listdir(filenames.output_folder + "/Filtered_DP")
    all_triplets = []
    for c, f in enumerate(files):
        triplet_data = F.load_to_file("Filtered_DP/" + f, filenames.output_folder)
        all_triplets += triplet_data
        print(c)
    F.save_to_file('all_dp_triplet', all_triplets, filenames.output_folder)
def combine_dp_relations(filenames):
    # Merge per-file relation lists and de-duplicate.
    files = os.listdir(filenames.output_folder + "/Relations_DP")
    all_relations = []
    for c, f in enumerate(files):
        relation_data = F.load_to_file("Relations_DP/" + f, filenames.output_folder)
        all_relations += relation_data
        print(c)
    all_relations = list(set(all_relations))
    print(all_relations)
    F.save_to_file(filenames.dp_relation_file, all_relations, filenames.output_folder)
# Example 6
def find_co_occurences(filenames):
    # Windowed co-occurrence extraction.
    os.makedirs(filenames.output_folder + "/occurences", exist_ok=True)

    data = F.load_to_file(filenames.updated_words_file_name,
                          filenames.output_folder)
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    print(len(vocab), len(data))

    data_index = [word_to_index[w] for w in data]
    unknown_id = word_to_index['UKN']
    occurrence = {}
    window = 2
    print("Words:", len(data_index))
    for i in range(-window, window + 1):
        occurrence[i] = []

    for c in range(len(data_index)):
        start = max(0, c - window)
        end = min(len(data_index) - 1, c + window)
        if data_index[c] != unknown_id:
            for j in range(start, end + 1):
                # Skip the centre word itself and UKN neighbours.
                if c != j and data_index[j] != unknown_id:
                    occurrence[j - c].append((data_index[c], data_index[j]))
        # Flush to disk every 10M tokens to bound memory use.
        if c % 10000000 == 9999999:
            F.save_to_file(
                "occurences/" + filenames.updated_words_file_name +
                str(c // 10000000 + 1), occurrence, filenames.output_folder)
            for i in range(-window, window + 1):
                occurrence[i] = []

    # Save whatever remains after the last full chunk.
    if any(occurrence[k] for k in occurrence):
        F.save_to_file(
            "occurences/" + filenames.updated_words_file_name +
            str(len(data_index)), occurrence, filenames.output_folder)

    for k in occurrence:
        print(k, len(occurrence[k]))
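# Self-contained sketch of the windowed co-occurrence logic above on a toy
# index sequence; offset 0 never occurs because the centre word is skipped.
def toy_cooccurrences(data_index, window=2, unknown_id=99):
    occurrence = {i: [] for i in range(-window, window + 1) if i != 0}
    for c, w in enumerate(data_index):
        if w == unknown_id:
            continue
        for j in range(max(0, c - window), min(len(data_index) - 1, c + window) + 1):
            if j != c and data_index[j] != unknown_id:
                occurrence[j - c].append((w, data_index[j]))
    return occurrence

print(toy_cooccurrences([0, 1, 2, 99, 3]))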
# Example 7
def find_dp_triplets(filenames):
    # Filter DP triplets based on vocab; convert DP dicts to (H, R, T) triplets.
    vocab = set(F.load_to_file(filenames.vocab_file, filenames.output_folder))

    files = os.listdir(filenames.output_folder + "/dp_data_pos")

    relation = []
    final_triplet = []
    for f in files:
        triplet_data = F.load_to_file("dp_data_pos/" + f,
                                      filenames.output_folder)
        # Find H R T
        for sent in triplet_data:
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            # Keep only triplets whose head and tail are both in the vocabulary.
            if H in vocab and T in vocab:
                final_triplet.append((H, R, T))

    print(len(final_triplet), len(relation))
    F.save_to_file(filenames.dp_triplet_file, final_triplet,
                   filenames.output_folder)
    F.save_to_file(filenames.dp_relation_file, relation,
                   filenames.output_folder)

    print(relation)
# Example 8
def find_wn_relations(filenames):
    # Wordnet Relation
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    stop = stopwords.words('english')
    d = {}
    count = 1
    # Pairwise scan over the vocabulary (O(|V|^2) get_relation calls).
    for w1 in vocab:
        countj = 0
        d[w1] = {}
        if w1 not in stop and len(w1) > 2:
            for w2 in vocab:
                countj += 1
                if w1 != w2 and w2 not in stop and len(w2) > 2:
                    rel = get_relation(w1, w2)
                    if len(rel) > 0:
                        d[w1][w2] = rel
            print(count, countj)
            count += 1

    F.save_to_file(filenames.wordnet_triplet_file, d, filenames.output_folder)
    a = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)
    print(a)
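# get_relation is not shown in these examples; a plausible sketch using NLTK's
# WordNet API, matching the relation names in the wordnet_relation list used
# later. The 'strong'/'weak' relations are omitted here since their definition
# is not given in the source.
from nltk.corpus import wordnet as wn

def get_relation_sketch(w1, w2):
    rels = set()
    for s1 in wn.synsets(w1):
        for s2 in wn.synsets(w2):
            if s1 == s2:
                rels.add('synset')
            if s2 in s1.hypernyms():
                rels.add('hypernym')
            if s2 in s1.hyponyms():
                rels.add('hyponym')
            if s2 in s1.member_holonyms() + s1.part_holonyms():
                rels.add('holonym')
        for l1 in s1.lemmas():
            if w2 in [a.name() for a in l1.antonyms()]:
                rels.add('antonym')
    return list(rels)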
def combine_all_triplets(filenames):
    # Trimmed variant: builds only the relation <-> index mappings.
    # The full triplet-numbering version appears in a later example.

    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    dp_relation = F.load_to_file(filenames.dp_relation_file, filenames.output_folder)

    wordnet_relation = ['antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak']
    occ_relation = [0, 1, 2, -1, -2]  # co-occurrence window offsets
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", occ_relation)

    relations = dp_relation + wordnet_relation + occ_relation
    print(relations)
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation, filenames.output_folder)
def preprocessing(filenames):
    data = ""
    sentences = []
    words = []

    # Find sentences and cache them to file.
    data = F.readData(filenames.corpus_name)
    import os
    if not os.path.isfile(filenames.output_folder + '/' + filenames.sents_file_name):
        sentences = F.getSentences(data)
        F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)
    else:
        print("Sentences File Found")
        sentences = F.load_to_file(filenames.sents_file_name, filenames.output_folder)

    # Tokenize into words and cache them to file.
    if not os.path.isfile(filenames.output_folder + '/' + filenames.words_file_name):
        words = F.getWords(sentences)
        F.save_to_file(filenames.words_file_name, words, filenames.output_folder)
    else:
        print("Words File Found")
        words = F.load_to_file(filenames.words_file_name, filenames.output_folder)

    print("Length of text data: ", len(data))

    # Min-count threshold for the vocabulary; values from 0 to 400 were tried.
    updated_words, vocab = F.getVocabulary(words, 10, filenames)

    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)

    word_to_index = {}
    index_to_word = {}
    for k, v in enumerate(vocab):
        word_to_index[v] = k
        index_to_word[k] = v

    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)
    print(len(sentences), len(words))
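# Hypothetical end-to-end driver; the stage order is inferred from the
# functions in these examples, and the filenames container holding the paths
# (corpus_name, output_folder, vocab_file, ...) is assumed, not shown here.
def run_pipeline(filenames):
    preprocessing(filenames)          # sentences, words, vocab, index maps
    find_co_occurences(filenames)     # windowed co-occurrence pairs
    find_dp_triplets(filenames)       # dependency-parse (H, R, T) triplets
    find_wn_relations(filenames)      # WordNet relations over the vocab
    combine_all_triplets(filenames)   # relation indices + positive table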
# Example 11
# Paginated scrape: follow the "next" link until no more pages remain.
while url:

    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    opinions = soup.select('li.js_product-review')

    for opinion in opinions:
        features = {
            key: extract_feature(opinion, *args)
            for key, args in selectors.items()
        }

        features['opinion_id'] = int(opinion['data-entry-id'].strip())
        # e.g. "4,5/5" -> 4.5 (decimal comma normalised to a dot)
        features['stars'] = float(features['stars'].split('/')[0].replace(
            ',', '.'))
        features['useful'] = int(features['useful'])
        features['useless'] = int(features['useless'])
        features['content'] = clean_string(features['content'], '\n', '\r')
        features['pros'] = clean_string(features['pros'], '\n', '\r')
        features['cons'] = clean_string(features['cons'], '\n', '\r')

        all_opinions.append(Opinion(**features))

    try:
        url = url_host + soup.select('a.pagination__next').pop()['href']
    except IndexError:
        url = None

save_to_file(all_opinions, product_id)
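# extract_feature and clean_string are not defined in this example; minimal
# sketches consistent with how they are called above. BeautifulSoup's
# select_one is a real API; the selector arguments themselves live in the
# (unshown) selectors dict.
def extract_feature(opinion, selector, attribute=None):
    node = opinion.select_one(selector)
    if node is None:
        return None
    return node[attribute] if attribute else node.get_text().strip()

def clean_string(text, *removals):
    for r in removals:
        text = text.replace(r, ' ')
    return text.strip()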
# Example 12
def combine_all_triplets(filenames):
    # Positive and NUM

    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    dp_relation = F.load_to_file(filenames.dp_relation_file,
                                 filenames.output_folder)
    dp_triplet = F.load_to_file(filenames.dp_triplet_file,
                                filenames.output_folder)
    wordnet_triplet = F.load_to_file(filenames.wordnet_triplet_file,
                                     filenames.output_folder)
    files = os.listdir(filenames.output_folder + "/occurences")
    occ = {}
    flag = 1
    for f in files:
        print(f)
        if flag:
            occ = F.load_to_file("occurences/" + f, filenames.output_folder)
            flag = 0
        else:
            temp_occ = F.load_to_file("occurences/" + f,
                                      filenames.output_folder)
            for k in occ:
                occ[k] += temp_occ[k]
    wordnet_relation = [
        'antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak'
    ]
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", list(occ.keys()))

    relations = dp_relation + wordnet_relation + list(occ.keys())
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index,
                   filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation,
                   filenames.output_folder)

    relation_to_index = F.load_to_file(filenames.r2i_file,
                                       filenames.output_folder)
    index_to_relation = F.load_to_file(filenames.i2r_file,
                                       filenames.output_folder)

    print(relation_to_index)
    print(index_to_relation)

    dp_number_triple = []
    count = 0  # DP triples skipped because a word or relation is out of vocab
    for dp_triple in dp_triplet:
        try:
            a, b, c = dp_triple
            a = word_to_index[a]
            b = relation_to_index[b]
            c = word_to_index[c]
            dp_number_triple.append((a, b, c))
        except KeyError:
            count += 1

    wn_number_triple = []
    for w1 in wordnet_triplet:
        for w2 in wordnet_triplet[w1]:
            a = word_to_index[w1]
            b = word_to_index[w2]
            for c in wordnet_triplet[w1][w2]:
                c = relation_to_index[c]
                wn_number_triple.append((a, c, b))

    # All co-occurrence pairs, duplicates included
    occ_number_triple = []
    for r in occ:
        c = relation_to_index[r]
        for a, b in occ[r]:
            occ_number_triple.append((a, c, b))

    # Co-occurrence triples de-duplicated via dict keys
    occ_number_triple_without_duplicate = {}
    for r in occ:
        if -10 < r < 10:
            c = relation_to_index[r]
            print(r, c)
            l = len(occ_number_triple_without_duplicate)
            for a, b in occ[r]:
                occ_number_triple_without_duplicate[(a, c, b)] = 1
            # Number of new triples contributed by this offset
            print(len(occ_number_triple_without_duplicate) - l)
    print(list(occ_number_triple_without_duplicate.keys())[:10])
    print(len(occ_number_triple_without_duplicate))
    occ_number_triple_without_dup = list(
        occ_number_triple_without_duplicate.keys())

    F.save_to_file(filenames.all_relations, relations, filenames.output_folder)
    print(len(relations))
    print(len(wn_number_triple))
    print(len(dp_number_triple))
    print(len(occ_number_triple))
    print(len(occ_number_triple_without_duplicate))

    print(index_to_relation)

    F.save_to_file(filenames.wn_num_file, wn_number_triple,
                   filenames.output_folder)
    F.save_to_file(filenames.occ_num_file, occ_number_triple,
                   filenames.output_folder)
    F.save_to_file(filenames.dp_num_file, dp_number_triple,
                   filenames.output_folder)
    F.save_to_file(filenames.occ_num_dups_file, occ_number_triple_without_dup,
                   filenames.output_folder)

    print(len(wn_number_triple), len(occ_number_triple), len(dp_number_triple))

    # positive_table[head][relation] -> list of observed positive tails
    positive_table = {}
    total_triple = wn_number_triple + dp_number_triple + occ_number_triple_without_dup
    for triple in total_triple:
        a, b, c = triple
        if a not in positive_table:
            positive_table[a] = {}
        if b not in positive_table[a]:
            positive_table[a][b] = [c]
        else:
            positive_table[a][b].append(c)

    F.save_to_file(filenames.positive_table_file, positive_table,
                   filenames.output_folder)
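# getBatch is not shown in these examples; a hedged sketch of how the
# positive_table built above could drive negative sampling for the
# margin-ranking loss in train(): corrupt each positive tail with a random
# entity that is not a known positive for the same (head, relation).
import random

def corrupt_tails(positive_table, vocab_size, h, r):
    pos_tails = set(positive_table[h][r])
    negatives = []
    for _ in positive_table[h][r]:
        t_neg = random.randrange(vocab_size)
        while t_neg in pos_tails:
            t_neg = random.randrange(vocab_size)
        negatives.append((h, r, t_neg))
    return negatives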
# Example 13
sents_file_name = 'sentences'  # assumed: this definition is missing in the original snippet
words_file_name = 'words'
updated_words_file_name = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'

corpus_name = '../Data/reviews.txt'

data = ""
sentences = []
words = []
if 's' not in F.sys.argv:
    print("A")
    data = F.readData(corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(sents_file_name, sentences)
else:
    print("B")
    sentences = F.load_to_file(sents_file_name)

if 'w' not in F.sys.argv:
    print("C")
    words = F.getWords(sentences)
    F.save_to_file(words_file_name, words)
else:
    print("D")
    words = F.load_to_file(words_file_name)

updated_words, vocab = F.getVocabulary(words, 400)
F.save_to_file(vocab_file, vocab)
F.save_to_file(updated_words_file_name, updated_words)
# Example 14
for i in range(-window, window + 1):
    occurrence[i] = []

for c in range(len(data_index)):
    start = max(0, c - window)
    end = min(len(data_index) - 1, c + window)
    if data_index[c] != unknown_id:
        for j in range(start, end + 1):
            # Skip the centre word itself and UKN neighbours.
            if c != j and data_index[j] != unknown_id:
                occurrence[j - c].append((data_index[c], data_index[j]))
    # Flush to disk every 10M tokens to bound memory use.
    if c % 10000000 == 9999999:
        F.save_to_file(
            "occurences/" + occurrence_data_file + str(c // 10000000 + 1),
            occurrence)
        for i in range(-window, window + 1):
            occurrence[i] = []

for k in occurrence:
    print(k, len(occurrence[k]))

# data = F.load_to_file(occurrence_data_file)

dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("END", time_t)
# Example 15
def preprocessing(filenames):
    data = ""
    sentences = []
    words = []

    # Find sentences and save to file.
    data = F.readData(filenames.corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(filenames.sents_file_name, sentences,
                   filenames.output_folder)

    # Tokenize into words and save to file.
    words = F.getWords(sentences)
    F.save_to_file(filenames.words_file_name, words, filenames.output_folder)

    updated_words, vocab = F.getVocabulary(words, 400, filenames)
    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words,
                   filenames.output_folder)

    word_to_index = {}
    index_to_word = {}
    for k, v in enumerate(vocab):
        word_to_index[v] = k
        index_to_word[k] = v
    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)

    print(len(sentences), len(words))
# Example 16
# Fragment of the WordNet-relation scan (see find_wn_relations above);
# vocab, stop, d, count and get_relation are defined earlier in the source.
for w1 in vocab:
    countj = 0
    d[w1] = {}
    if w1 not in stop and len(w1) > 2:
        for w2 in vocab:
            countj += 1
            if w1 != w2 and w2 not in stop and len(w2) > 2:
                rel = get_relation(w1, w2)
                if len(rel) > 0:
                    d[w1][w2] = rel
        print(count, countj)
        count += 1

F.save_to_file(wordnet_realtion_file, d)

a = F.load_to_file(wordnet_realtion_file)

print(a)
# Example 17
wordnet_relation = [
    'antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak'
]
dp_relation = [
    'advmod', 'amod', 'appos', 'compound', 'conj', 'fixed', 'flat', 'doeswith',
    'list', 'nmod', 'nummod', 'orphan', 'reparandum'
]

relations = dp_relation + wordnet_relation + list(occ.keys())
relation_to_index = {}
index_to_relation = {}
for k, v in enumerate(relations):
    relation_to_index[v] = k
    index_to_relation[k] = v
F.save_to_file(relation_to_index_file, relation_to_index)
F.save_to_file(index_to_relation_file, index_to_relation)

relation_to_index = F.load_to_file(relation_to_index_file)
index_to_relation = F.load_to_file(index_to_relation_file)

print(relation_to_index)
print(index_to_relation)

dp_number_triple = []
# Example 18
files = os.listdir(F.folder + "dp_data_pos")

relation = []
final_triplet = []
for f in files:
    triplet_data = F.load_to_file("dp_data_pos/" + f)
    # Find H R T
    for sent in triplet_data:
        for t in sent:
            (H, HPOS), R, (T, TPOS) = t
            if R not in relation:
                relation.append(R)
            # Keep only triplets whose head and tail are both in the vocabulary.
            if H in vocab and T in vocab:
                final_triplet.append((H, R, T))


print(len(final_triplet), len(relation))
F.save_to_file(final_triplet_file, final_triplet)
F.save_to_file(dp_relation_file, relation)

print(relation)


dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("END", time_t)
# Example 19
    fun.plot_MSE_train_test(polydegree,
                            cv_error_train_opt[:, 0],
                            cv_error_test_opt[:, 0],
                            '%s, $N$=%d, $K$=%d, noise=%.2f' %
                            (reg_str, N, K, noise),
                            'train_test_%s' % save_cv,
                            fig_path,
                            run_mode,
                            resample='CV',
                            xlim=xlim,
                            ylim=ylim)

    # Write bootstrap to file
    fun.save_to_file(
        [bs_error_test_opt[:, 0], bs_bias_opt[:, 0], bs_var_opt[:, 0]],
        ['bs_error_test', 'bs_bias', 'bs_var'],
        write_path + 'franke/bias_var_task_%s_%s.txt' % (run_mode, save_bs),
        benchmark)

    # Write CV to file
    fun.save_to_file([cv_error_test_opt[:, 0], cv_error_train[:, 0]],
                     ['cv_error_test', 'cv_error_train'], write_path +
                     'franke/train_test_task_%s_%s.txt' % (run_mode, save_cv),
                     benchmark)

    plt.show()

########################################################################################################################
if run_mode == 'c':
    # Performs Cross-Validation with OLS
    K = 5