from keras.preprocessing.text import Tokenizer, base_filter

def tokenization(all_texts):
    tokenizer = Tokenizer(filters=base_filter(), lower=True, split=" ")
    tokenizer.fit_on_texts(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    return tokenizer, word_index
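# A minimal usage sketch (the sample texts are hypothetical); assumes the
# pre-1.0 Keras API where base_filter() still exists:
texts = ["the quick brown fox", "the quick grey wolf"]
tokenizer, word_index = tokenization(texts)
sequences = tokenizer.texts_to_sequences(texts)  # lists of 1-based word ids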
class token(Tokenizer):
    def __init__(self, field_name, nb_words=None, filters=base_filter(),
                 lower=True, split=' ', char_level=False, arr_of_text=None):
        self.text = ""
        self.field_name = field_name
        # avoid the shared-mutable-default-argument bug
        self.arr_of_text = arr_of_text if arr_of_text is not None else []
        super(token, self).__init__(nb_words, filters, lower, split, char_level)
        self.create_text()
        self.fit_on_texts([self.text])
import string

def bigram_text_to_word_sequence(text, bigram, filters=base_filter(), lower=False, split=" "):
    '''filters: sequence of characters to filter out'''
    if lower:
        text = text.lower()
    text = text.translate(string.maketrans(filters, split * len(filters)))
    seq = text.split(split)
    sentences = [_f for _f in seq if _f]
    return bigram(sentences)
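# Hedged sketch of calling bigram_text_to_word_sequence(): `bigram` is any
# callable that maps a token list to a list with frequent pairs merged.
# Using gensim's Phraser for that is an assumption, not part of the snippet.
from gensim.models.phrases import Phrases, Phraser

train = [["new", "york", "city"], ["new", "york", "times"]]
phraser = Phraser(Phrases(train, min_count=1, threshold=0.1))
tokens = bigram_text_to_word_sequence("new york city",
                                      lambda toks: phraser[toks])
# -> ['new_york', 'city'] with these toy co-occurrence stats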
def convert_word_to_int(x_train, num_words, interp=True):
    x = []
    # keep punctuation when `interp` is set; otherwise strip it with the
    # default Keras filter
    fun = no_filter() if interp else base_filter()
    for row in x_train:
        x.append(one_hot(row, num_words, filters=fun))
    return x
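# Usage sketch: `no_filter` comes from the surrounding project; the stub
# below is an assumption that it returns an empty filter string, so
# punctuation is kept.
def no_filter():
    return ''

ids = convert_word_to_int(["hello, world", "goodbye world"], num_words=1000)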
def give_vocabulary(sentences_df):
    '''
    @parameter: the dataframe from the json file with the 5 columns we need
    @returns: the vocabulary as a set, its size, and a word-to-index dict.
    '''
    list_of_sentences1 = sentences_df['sentence1'].tolist()
    list_of_sentences2 = sentences_df['sentence2'].tolist()
    list_sentence_word_tmp = []
    for s1, s2 in zip(list_of_sentences1, list_of_sentences2):
        sentence_unicode1 = make_unicode(s1)
        sentence_unicode2 = make_unicode(s2)
        list_sentence_word_tmp += text_to_word_sequence(
            sentence_unicode1.encode('ascii'),
            filters=base_filter(), lower=True, split=" ")
        list_sentence_word_tmp += text_to_word_sequence(
            sentence_unicode2.encode('ascii'),
            filters=base_filter(), lower=True, split=" ")
    set_words = set(list_sentence_word_tmp)
    word2idx = {word: i for i, word in enumerate(set_words)}
    print "length of vocabulary: %d" % len(set_words)
    return set_words, len(set_words), word2idx
filename = "data/de-en.tgz" urllib.urlretrieve(url, filename) tar = tarfile.open(filename, "r:gz") tar.extractall("data/") tar.close() lines = [] with open("data/europarl-v7.de-en.en") as f: for line in f: line_processed = tf.compat.as_str(line).decode('utf-8').encode("utf-8") lines.append(line_processed) if len(lines) == max_lines: break return lines tokenizer = Tokenizer(nb_words=None, filters=base_filter(), lower=True, split=" ") # tokenizer = Tokenizer(nb_words=None) lines = read_data() print('Lines:', len(lines)) X = [] Y = [] counter = Counter() for line in lines: words = line.lower().strip().split(" ") if len(words) < input_dim: continue sentence = [] y = None
def main():
    scoreList = [0.0, 0.0]
    with open('data/info.json') as j:
        info = ujson.load(j)
    for problem in os.listdir('data'):
        greek = False
        if problem.startswith('problem'):
            truthPath = 'data/truth/' + problem + '/clustering.json'
            with open(truthPath) as t:
                truth = ujson.load(t)
            print(problem)
            probTokList = []
            docList = []
            docDict = {}
            X = []
            Y = []
            path = 'data/' + problem
            for entry in info:
                if entry["folder"] == problem:
                    lang = entry["language"]
                    if entry["language"] == "gr":
                        greek = True
            CV = CountVectorizer(input='filename', strip_accents='unicode',
                                 analyzer='word', ngram_range=(1, 4))
            docs = [path + '/' + x for x in os.listdir(path)]
            cMatrix = CV.fit_transform(docs)
            for doc in os.listdir(path):
                docTokList = []
                with open(path + '/' + doc) as d:
                    article = d.readlines()
                for sent in article:
                    sentTokList = []
                    for word in sent.split():
                        for token in word:
                            procToken = preprop(token, greek)
                            sentTokList.append(procToken)  # every item is a normalized character
                    docTokList.append(' '.join(sentTokList))  # every item is a sentence
                probTokList.append(' '.join(docTokList))  # every item is a document
                docList.append(doc)
            tokenizer = text.Tokenizer(nb_words=None, filters=text.base_filter(),
                                       lower=True, split=" ")
            tokenizer.fit_on_texts(probTokList)
            seqList = tokenizer.texts_to_sequences(probTokList)
            uniqueTokens = max([max(x) for x in seqList])
            print(uniqueTokens, lang)
            sampling_table = sequence.make_sampling_table(uniqueTokens + 1)
            for i, seq in enumerate(seqList):
                x, y = sequence.skipgrams(seq, uniqueTokens, window_size=4,
                                          negative_samples=1.0, categorical=False,
                                          sampling_table=sampling_table)
                x = zip(x, y)
                X.append(x)
                docDict[docList[i]] = seq
            strX = [str(x) for x in X]
            xTokenizer = text.Tokenizer(nb_words=None, filters=text.base_filter(),
                                        lower=True, split=" ")
            xTokenizer.fit_on_texts(strX)
            docMatrix = xTokenizer.sequences_to_matrix(strX, mode="tfidf")
            pairs = combinations(docDict.keys(), 2)
            cList = []
            nnDict = {}
            for cluster in truth:
                cPairs = []
                if len(cluster) > 1:
                    for item in cluster:
                        cPairs.append(str(item["document"]))
                    cList.extend(list(permutations(cPairs, 2)))
            for pair in pairs:
                nnDict[pair] = pair in cList
            for i, doc in enumerate(docMatrix):
                docDict[docList[i]] = doc
            truthCounter = Counter(nnDict.values())
            baseline = 1 - float(truthCounter[True]) / float(len(nnDict))
            print("Baseline for {} is {}".format(problem, baseline))
            clusterCount = Counter()
            kmclusters = False  # set to True for the iterative KMeans sweep instead of meanshift
            if kmclusters:
                pbar = ProgressBar()
                for nclusters in pbar(reversed(range(len(docMatrix) - 1))):
                    clusters = KMclusterer(nclusters + 1, cMatrix)
                    for c in range(nclusters + 1):
                        for clusterpair in combinations(
                                [i for i, x in enumerate(clusters) if x == c], 2):
                            combo = (docList[clusterpair[0]], docList[clusterpair[1]])
                            clusterCount[combo] += 1
            else:
                clusters = KMclusterer(int(len(docMatrix) * 0.67), docMatrix)
                for clusterpair in combinations([i for i, x in enumerate(clusters)], 2):
                    combo = (docList[clusterpair[0]], docList[clusterpair[1]])
                    clusterCount[combo] += 1
            x = 0.0
            scoreList[0] += truthCounter[True]
            deleteList = []
            for combo in nnDict.keys():
                if combo not in clusterCount.keys():
                    deleteList.append(combo)
            y = 0.0
            for item in deleteList:
                if item in cList:
                    y += 1
                del nnDict[item]
            scores = sharedNN(docDict, nnDict)
            print("Deleted pairs are {}% of total correct pairs, "
                  "{}% of deleted pairs was wrongly deleted".format(
                      round(y / len(cList) * 100.0, 2),
                      round(y / len(deleteList) * 100.0, 2)))
            for combo in clusterCount.most_common(20):
                if combo[0] in cList:
                    x += 1
                    scoreList[1] += 1
            print("prec: {}".format(x / 20))
            if not os.path.exists('answers/' + problem):
                os.mkdir('answers/' + problem)
            clusDict = defaultdict(list)
            rankDict = defaultdict(list)
            for i, cluster in enumerate(list(clusters)):
                clusDict[cluster].append({"document": docList[i]})
                rankDict[cluster].append(docList[i])
            with open('answers/' + problem + '/clustering.json', "w") as jsonFile:
                ujson.dump(list(clusDict.values()), jsonFile, indent=4)
            rankList = []
            for value in rankDict.values():
                if len(value) > 1:
                    for pair in combinations(value, 2):
                        rankList.append({"document1": pair[0],
                                         "document2": pair[1],
                                         "score": scores[pair][0]})
            with open('answers/' + problem + '/ranking.json', "w") as jsonFile:
                ujson.dump(rankList, jsonFile, indent=4)
def fit_tokenizer(lines, max_words=None):
    # drop '_' from the filter so underscore-joined tokens survive
    tokenizer = Tokenizer(max_words, filters=base_filter().replace('_', ''))
    tokenizer.fit_on_texts(lines)
    return tokenizer
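# Quick sketch of fit_tokenizer() on hypothetical lines; keeping '_' out of
# the filter preserves underscore-joined tokens such as "new_york":
lines = ["new_york is big", "san_francisco is foggy"]
tok = fit_tokenizer(lines, max_words=10000)
print sorted(tok.word_index.keys())  # 'new_york' survives as one token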
import keras.preprocessing.text
from keras.preprocessing.text import base_filter

def test(text):
    print keras.preprocessing.text.text_to_word_sequence(
        text, filters=base_filter(), lower=True, split=" ")
    print keras.preprocessing.text.one_hot(
        text, 5, filters=base_filter(), lower=True, split=" ")
import tensorflow as tf
from keras.preprocessing.text import Tokenizer, base_filter

def read_data(filename):
    """Read up to max_lines lines from the given file as UTF-8 strings."""
    lines = []
    with open(filename) as f:
        for line in f:
            line_processed = tf.compat.as_str(line).decode('utf-8').encode("utf-8")
            lines.append(line_processed)
            if len(lines) == max_lines:
                break
    return lines

tokenizer = Tokenizer(nb_words=None, filters=base_filter(), lower=True, split=" ")
filename = "data/europarl-v7.de-en.en"
lines = read_data(filename)
print('Lines:', len(lines))
X = []
Y = []
for line in lines:
    words = line.lower().strip().split(" ")
    if len(words) < max_sentence_length:
        continue
    sentence = []
def text_generator():
    for path in text_files:
        with open(path, 'r') as f:
            yield f.read()

len(text_files)  # notebook cell: inspect the corpus size

###

# Our corpus is small enough that we don't need to worry about this,
# but capping the vocabulary is good practice.
max_vocab_size = 50000

# `filters` specifies which characters to strip out; `base_filter()`
# covers basic punctuation, and I like to extend it with common
# unicode punctuation.
tokenizer = Tokenizer(nb_words=max_vocab_size, filters=base_filter())

# fit the tokenizer
tokenizer.fit_on_texts(text_generator())

# We also want to keep track of the actual vocab size; we'll need it later.
# Note: we add one because `0` is a reserved index in Keras' tokenizer.
vocab_size = len(tokenizer.word_index) + 1

###

embedding_dim = 256

###
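# Sanity-check sketch for the "+ 1": Keras' Tokenizer starts word indices
# at 1, leaving 0 free for padding/masking (the toy texts are hypothetical):
t = Tokenizer(nb_words=None, filters=base_filter())
t.fit_on_texts(["a b b", "b c"])
print(min(t.word_index.values()))  # 1, hence vocab_size = len(word_index) + 1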
parser.add_argument("--max_words", type=int) parser.add_argument("--maxlen", type=int) args = parser.parse_args() print "Loading data..." with open(args.data) as f: lines = f.readlines() print "Lines:", len(lines) print "Sample question, correct answer, incorrect answer:" for i in xrange(3): print lines[i] print "Tokenizing data..." tokenizer = Tokenizer(args.max_words, filters=base_filter().replace('_', '')) tokenizer.fit_on_texts(lines) print "Number of words: ", len(tokenizer.word_index) if args.tokenizer_save_path: print "Saving tokenizer to %s..." % args.tokenizer_save_path pickle.dump(tokenizer, open(args.tokenizer_save_path, "wb"), pickle.HIGHEST_PROTOCOL) wcounts = tokenizer.word_counts.items() wcounts.sort(key=lambda x: x[1], reverse=True) print "Most frequent words:", wcounts[:10] print "Most rare words:", wcounts[-10:] print "Number of words occurring %d times:" % wcounts[-1][1], np.sum(np.array(tokenizer.word_counts.values())==wcounts[-1][1]) print "Converting text to sequences..."
def bigram_one_hot(text, n, bigram, filters=base_filter(), lower=False, split=" "):
    # hash each (possibly merged) token into [1, n-1]; index 0 stays reserved
    seq = bigram_text_to_word_sequence(text, bigram, filters=filters,
                                       lower=lower, split=split)
    return [(abs(hash(w)) % (n - 1) + 1) for w in seq]
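# Hedged sketch: bigram_one_hot() applies the hashing trick, so ids land in
# [1, n-1] and unrelated words may collide; `phraser` reuses the gensim
# Phraser assumed in the earlier sketch.
ids = bigram_one_hot("new york city", n=1000,
                     bigram=lambda toks: phraser[toks])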
from IPython.display import SVG
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.visualize_util import model_to_dot, plot
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, base_filter
from gensim.models.doc2vec import Word2Vec
# added because it is needed for K.mean()
from keras import backend as K

path = get_file('alice.txt', origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")
corpus = open(path).readlines()[0:200]
# drop lines with fewer than two spaces
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]
# extend base_filter with the apostrophe
tokenizer = Tokenizer(filters=base_filter() + "'")
# fit the tokenizer on the corpus
tokenizer.fit_on_texts(corpus)
# convert the corpus to integer sequences (writing it to file did not work)
corpus = tokenizer.texts_to_sequences(corpus)
# a = ["a d d", "d a"] # a = ["我是一个爱生活的人", "他也是一个爱生活的人"] # one_h = one_hot(filters=base_filter(), n=30, text=a) # # o.fit_on_texts(a) # # b = one_h(a) # print one_hot(filters=base_filter(), n=30, text=a) # print one_hot(filters=base_filter(), n=30, text=a) # a=['hello world', 'foo bar'] # tokenizer = Tokenizer() # train_tokens = tokenizer.fit_transform(a) # print train_tokens # comma_tokenizer = lambda x: jieba.cut(x, cut_all=True) # from sklearn.feature_extraction.text import HashingVectorizer # v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000, non_negative=True) # train_data = v.fit_transform(a) # print train_data # import jieba a = "我是一个男孩" c = jieba.cut(a, cut_all=False) w = "" # print(", ".join(c)) for i in c: w += i + " " # print i w = w[:len(w)-1].encode('utf8') # w = "我 是 一个男孩" print one_hot(filters=base_filter(), n=30000, text=w) # print w # # print c.next()
parser.add_argument("--max_words", type=int) parser.add_argument("--maxlen", type=int) args = parser.parse_args() print "Loading data..." with open(args.data) as f: lines = f.readlines() print "Lines:", len(lines) print "Sample question, correct answer, incorrect answer:" for i in xrange(3): print lines[i] print "Tokenizing data..." tokenizer = Tokenizer(args.max_words, filters=base_filter().replace('_', '')) tokenizer.fit_on_texts(lines) print "Number of words: ", len(tokenizer.word_index) if args.tokenizer_save_path: print "Saving tokenizer to %s..." % args.tokenizer_save_path pickle.dump(tokenizer, open(args.tokenizer_save_path, "wb"), pickle.HIGHEST_PROTOCOL) wcounts = tokenizer.word_counts.items() wcounts.sort(key=lambda x: x[1], reverse=True) print "Most frequent words:", wcounts[:10] print "Most rare words:", wcounts[-10:] print "Number of words occurring %d times:" % wcounts[-1][1], np.sum( np.array(tokenizer.word_counts.values()) == wcounts[-1][1])
# Convolution
filter_length = 5
nb_filter = 64
pool_length = 4

# LSTM
lstm_output_size = 100

# Training
batch_size = 100
nb_epoch = 3

X = [x[1] for x in labeled_sample]
y = [x[0] for x in labeled_sample]

tk = text.Tokenizer(nb_words=2000, filters=text.base_filter(), lower=True, split=" ")
tk.fit_on_texts(X)
X = tk.texts_to_sequences(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.4))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='tanh',