def open_file(self, mode):
    """Load a WebQSP split and build per-pair features, including IDF-weighted word overlap."""
    suffix = "all-train" if mode == "train" else "all-test"
    stopwords = nltk.corpus.stopwords.words("english")
    with codecs.open("./WebQSP_Corpus/WebQSP" + suffix + ".txt", "r", encoding="utf-8") as f:
        for line in f:
            items = line[:-1].split("\t")
            s1 = clean_str(items[0]).split()
            s2 = clean_str(items[1]).split()
            label = int(items[2])
            self.s1s.append(s1)
            self.s2s.append(s2)
            self.labels.append(label)
            # Count non-stopword question words that also appear in the candidate answer.
            word_cnt = len([word for word in s1
                            if (word not in stopwords) and (word in s2)])
            self.features.append([len(s1), len(s2), word_cnt])
            local_max_len = max(len(s1), len(s2))
            if local_max_len > self.max_len:
                self.max_len = local_max_len

    self.data_size = len(self.s1s)

    # IDF over the question vocabulary, then an IDF-weighted overlap feature per pair.
    flatten = lambda l: [item for sublist in l for item in sublist]
    q_vocab = list(set(flatten(self.s1s)))
    idf = {}
    for w in q_vocab:
        idf[w] = np.log(self.data_size / len([1 for s1 in self.s1s if w in s1]))
    for i in range(self.data_size):
        wgt_word_cnt = sum([idf[word] for word in self.s1s[i]
                            if (word not in stopwords) and (word in self.s2s[i])])
        self.features[i].append(wgt_word_cnt)
    self.num_features = len(self.features[0])
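# A minimal, self-contained sketch of the IDF-weighted overlap feature computed
# above, on toy data. The corpus and variable names here are hypothetical and
# only illustrate the calculation, not the class that owns open_file().
import numpy as np

questions = [["who", "wrote", "hamlet"], ["who", "directed", "jaws"]]
answers = [["hamlet", "was", "written", "by", "shakespeare"],
           ["jaws", "was", "directed", "by", "spielberg"]]
n = len(questions)

# idf(w) = log(N / question-frequency of w)
vocab = set(w for q in questions for w in q)
idf = {w: np.log(float(n) / sum(1 for q in questions if w in q)) for w in vocab}

for q, a in zip(questions, answers):
    overlap = [w for w in q if w in a]            # raw overlapping words
    wgt = sum(idf[w] for w in overlap)            # IDF-weighted overlap feature
    print("overlap=%s weighted=%.3f" % (overlap, wgt))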
def test_sentence(sentence):
    """Represent a sentence as the sum of its word embedding vectors."""
    sentence = clean_str(sentence)
    words = list(sentence.split())
    ls = []
    for w in words:
        ls.append(vocab_dict[w])  # map each token to its row index in the embedding matrix W
    sum_vect = 0
    for index in ls:
        sum_vect += W[index]
    return sum_vect
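# Hypothetical usage of test_sentence(). vocab_dict (token -> row index) and
# W (embedding matrix) are assumed to be module-level globals, as the function
# above implies; the values below are made up for illustration. Note that a
# token missing from vocab_dict would raise a KeyError.
import numpy as np

vocab_dict = {"pay": 0, "the": 1, "invoice": 2}
W = np.random.rand(3, 300)                 # toy 300-dimensional embeddings

vec = test_sentence("pay the invoice")     # sum of the three word vectors
print(vec.shape)                           # (300,)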
from gensim.models import Word2Vec, Phrases
import pickle

from helpers import clean_str
from ANN import RNN, prepare_data, embedding_format

root = "C:/Users/1/James/grctc/GRCTC_Project/Classification/"
write_path = root + "Sequential_Models/word2vector/"
filename = root + 'Preprocessing/data/' \
                  'FinalAnnotationsModality_sentences_wArtificialProhibitions.txt'
# "Preprocessing/data/FinalAnnotationsModality_sentences.txt"
googleVecs = "C:/Users/1/James/grctc/GRCTC_Project/Classification/Data/Embeddings/word2vec/GoogleNews-vectors-negative300.bin"
file = '/annotated_data/EU.AML2015_new.txt'
rest_path = "C:/Users/1/James/REST/minimal-django-file-upload-example/src/" \
            "for_django_1-9/myproject/myproject/test/vectors/"

sentences = [clean_str(line.decode('utf-8').strip()).split()
             for line in open(filename, "r").readlines()]
legal_sentences = pickle.load(open(root + "/XMLParsers/eurolex_documents.pkl"))

#X = Word2Vec(legal_sentences, size=100, window=5, min_count=5, workers=4)
X = Word2Vec.load_word2vec_format(googleVecs, binary=True)  # C binary format

# test data prep is correct
root = "C:/Users/1/James/grctc/GRCTC_Project/Classification/Word2Vec/annotated_data/"
emb, y = prepare_data(filename=root + 'EU.AML2015_new.txt')
#emb = embedding_format(emb)
print(emb.shape)
model = RNN(X=emb, y=y, h_dim=5, num_class=3, type='lstm', pad=100)
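# Once the pretrained GoogleNews vectors are loaded into X, individual word
# vectors and nearest neighbours can be inspected directly with the old gensim
# Word2Vec interface used above. The probe words below are arbitrary examples
# and assumed to be in the model's vocabulary.
print(X['prohibition'].shape)                  # (300,) for the GoogleNews model
print(X.most_similar('obligation', topn=5))    # sanity-check the loaded embeddings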
def preprocess(datafile, MIN_LENGTH=10, LIMIT=59, header=True):
    line_num = 150000
    lines = []
    max_len = 0
    longest_q = ""
    dups = 0
    sentences = 0
    skipped = 0
    skipped_dup = 0
    count = 0
    with open(datafile) as f:
        for line in f:
            #count += 1
            #if count < 364000:
            #    continue
            if header == True:
                header = False
                continue
            #print line
            fields = line.strip('\n').split('\t')
            q1 = clean_str(fields[3])
            q2 = clean_str(fields[4])
            dup = fields[5]
            q1_len = len(q1.split())
            q2_len = len(q2.split())
            # Skip pairs that are too long or too short, tracking skipped duplicates.
            if q1_len > LIMIT or q2_len > LIMIT:
                skipped += 1
                if dup == '1':
                    skipped_dup += 1
                continue
            if q1_len + q2_len < MIN_LENGTH:
                skipped += 1
                if dup == '1':
                    skipped_dup += 1
                continue
            if q1_len > max_len:
                max_len = q1_len
                longest_q = q1
            if q2_len > max_len:
                max_len = q2_len
                longest_q = q2
            if dup == '1':
                dups += 1
            if len(q1) == 0:
                q1 = "."
            if len(q2) == 0:
                q2 = "."
            lines.append((q1, q2, dup))
            #print fields
            sentences += 1
    print "Longest question: %s (%d)" % (longest_q, max_len)
    print "duplicates: %d (%.2f)" % (dups, ((1.0 * dups) / sentences))
    print "skipped: %d (%d)" % (skipped, skipped_dup)
    return lines
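# Hypothetical call to preprocess() on a Quora-style duplicate-question TSV
# (id, qid1, qid2, question1, question2, is_duplicate), which matches the
# column indices the function reads; the path below is an assumption.
pairs = preprocess("data/quora_duplicate_questions.tsv", MIN_LENGTH=10, LIMIT=59)
q1, q2, dup = pairs[0]
print "%s | %s | %s" % (q1, q2, dup)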