Example #1
    def open_file(self, mode):
        # Assumes codecs, nltk, and numpy (as np) are imported at module
        # level, along with clean_str, and that the class initializes
        # self.s1s, self.s2s, self.labels, self.features, and self.max_len.
        if mode == "train":
            suffix = "all-train"
        else:
            suffix = "all-test"
        with codecs.open("./WebQSP_Corpus/WebQSP" + suffix + ".txt",
                         "r",
                         encoding="utf-8") as f:
            # A set gives O(1) membership tests for the stopword filter below.
            stopwords = set(nltk.corpus.stopwords.words("english"))

            for line in f:
                items = line.rstrip("\n").split("\t")

                s1 = clean_str(items[0]).split()
                s2 = clean_str(items[1]).split()
                label = int(items[2])

                self.s1s.append(s1)
                self.s2s.append(s2)
                self.labels.append(label)
                # Count non-stopword words of s1 that also occur in s2.
                word_cnt = len([
                    word for word in s1
                    if (word not in stopwords) and (word in s2)
                ])
                self.features.append([len(s1), len(s2), word_cnt])

                local_max_len = max(len(s1), len(s2))
                if local_max_len > self.max_len:
                    self.max_len = local_max_len

        self.data_size = len(self.s1s)

        # Inverse document frequency over the question (s1) side of the corpus.
        flatten = lambda l: [item for sublist in l for item in sublist]
        q_vocab = list(set(flatten(self.s1s)))
        idf = {}
        for w in q_vocab:
            # float() guards against integer division under Python 2.
            idf[w] = np.log(self.data_size /
                            float(len([1 for s1 in self.s1s if w in s1])))

        # Fourth feature: the overlap count reweighted by IDF, so rare
        # shared words contribute more than common ones.
        for i in range(self.data_size):
            wgt_word_cnt = sum([
                idf[word] for word in self.s1s[i]
                if (word not in stopwords) and (word in self.s2s[i])
            ])
            self.features[i].append(wgt_word_cnt)

        self.num_features = len(self.features[0])
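The last loop above builds an IDF-weighted overlap feature: each shared question word contributes log(N / df(w)) instead of 1. A standalone sketch of that computation on toy data (the sentences and names are illustrative, and stopword filtering is omitted for brevity):

import numpy as np

# Toy question/answer pairs; illustrative only.
questions = [["what", "is", "the", "capital", "of", "france"],
             ["who", "wrote", "hamlet"]]
answers = [["paris", "is", "the", "capital"],
           ["hamlet", "was", "written", "by", "shakespeare"]]

n = len(questions)
idf = {}
for w in set(w for q in questions for w in q):
    df = sum(1 for q in questions if w in q)  # document frequency over questions
    idf[w] = np.log(n / float(df))

# Rare shared words count for more than common ones.
for q, a in zip(questions, answers):
    print(sum(idf[w] for w in q if w in a))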
Example #2
def test_sentence(sentence):
    # Assumes clean_str, vocab_dict (word -> row index), and the embedding
    # matrix W are defined at module level.
    sentence = clean_str(sentence)
    words = sentence.split()
    # Look up indices, skipping out-of-vocabulary words instead of
    # raising a KeyError.
    ls = [vocab_dict[w] for w in words if w in vocab_dict]
    # Sum the embedding vectors of the known words.
    sum_vect = 0
    for index in ls:
        sum_vect += W[index]
    return sum_vect
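A quick usage sketch with a hypothetical two-word vocabulary and a 3-dimensional embedding matrix, just to show that the return value is the element-wise sum of the word vectors (every name below is a stand-in for the real module-level globals):

import numpy as np

# Hypothetical stand-ins for the globals test_sentence expects.
clean_str = lambda s: s.lower()
vocab_dict = {"good": 0, "movie": 1}
W = np.array([[1.0, 0.0, 2.0],
              [0.5, 1.0, 0.0]])

print(test_sentence("Good movie"))  # -> [1.5 1.  2. ]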
Example #3
from gensim.models import Word2Vec, Phrases
import pickle
from helpers import clean_str
from ANN import RNN, prepare_data, embedding_format

root = "C:/Users/1/James/grctc/GRCTC_Project/Classification/"
write_path = root + "Sequential_Models/word2vector/"
filename = root + 'Preprocessing/data/' \
                  'FinalAnnotationsModality_sentences_wArtificialProhibitions.txt'
# "Preprocessing/data/FinalAnnotationsModality_sentences.txt"

googleVecs = "C:/Users/1/James/grctc/GRCTC_Project/Classification/Data/Embeddings/word2vec/GoogleNews-vectors-negative300.bin"
file = '/annotated_data/EU.AML2015_new.txt'
rest_path = "C:/Users/1/James\REST/minimal-django-file-upload-example/src/" \
            "for_django_1-9/myproject/myproject/test/vectors/"

sentences = [
    clean_str(line.decode('utf-8').strip()).split()
    for line in open(filename, "rb").readlines()
]
# Pickled list of tokenized EUR-Lex sentences; open in binary mode.
legal_sentences = pickle.load(
    open(root + "XMLParsers/eurolex_documents.pkl", "rb"))
#X = Word2Vec(legal_sentences, size=100, window=5, min_count=5, workers=4)
X = Word2Vec.load_word2vec_format(googleVecs, binary=True)  # C binary format

# test data prep is correct
root = "C:/Users/1/James/grctc/GRCTC_Project/Classification/Word2Vec/annotated_data/"
emb, y = prepare_data(filename=root + 'EU.AML2015_new.txt')
#emb = embedding_format(emb)
print(emb.shape)
model = RNN(X=emb, y=y, h_dim=5, num_class=3, type='lstm', pad=100)
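Word2Vec.load_word2vec_format is the pre-1.0 gensim API; newer gensim loads the same GoogleNews binary through KeyedVectors instead. A minimal sketch, reusing the googleVecs path from above:

from gensim.models import KeyedVectors

# Post-1.0 gensim API for the same C-binary word2vec file.
X = KeyedVectors.load_word2vec_format(googleVecs, binary=True)
print(X["king"].shape)  # (300,) for the GoogleNews vectors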
Example #4
def preprocess(datafile, MIN_LENGTH=10, LIMIT=59, header=True):
    # Assumes clean_str is imported at module level.
    line_num = 150000
    lines = []
    max_len = 0
    longest_q = ""
    dups = 0
    sentences = 0
    skipped = 0
    skipped_dup = 0
    count = 0
    with open(datafile) as f:
        for line in f:
            #count += 1
            #if count < 364000:
            #    continue
            if header:
                header = False
                continue
            #print line
            fields = line.strip('\n').split('\t')

            # Quora question-pairs TSV: fields[3]/fields[4] are the two
            # questions and fields[5] is the is_duplicate label.
            q1 = clean_str(fields[3])
            q2 = clean_str(fields[4])
            dup = fields[5]

            q1_len = len(q1.split())
            q2_len = len(q2.split())

            if q1_len > LIMIT or q2_len > LIMIT:
                skipped += 1
                if dup == '1':
                    skipped_dup += 1
                continue

            if q1_len + q2_len < MIN_LENGTH:
                skipped += 1
                if dup == '1':
                    skipped_dup += 1
                continue

            if q1_len > max_len:
                max_len = q1_len
                longest_q = q1

            if q2_len > max_len:
                max_len = q2_len
                longest_q = q2

            if dup == '1':
                dups += 1

            # Use "." as a placeholder for empty questions.
            if len(q1) == 0:
                q1 = "."

            if len(q2) == 0:
                q2 = "."

            lines.append((q1, q2, dup))

            #print fields
            sentences += 1

    print "Longest question: %s (%d)" % (longest_q, max_len)
    print "duplicates: %d (%.2f)" % (dups, ((1.0 * dups) / sentences))
    print "skipped: %d (%d)" % (skipped, skipped_dup)

    return lines
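A hedged usage sketch; the file name below is an assumed local path for the Quora question-pairs TSV this function expects:

# "quora_duplicate_questions.tsv" is a hypothetical path.
pairs = preprocess("quora_duplicate_questions.tsv", MIN_LENGTH=10, LIMIT=59)
q1, q2, dup = pairs[0]
print(q1, q2, dup)  # dup stays a string: '0' or '1'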