Example 1
def tokenization(all_texts):
    tokenizer = Tokenizer(filters=base_filter(), lower=True, split=" ")
    tokenizer.fit_on_texts(all_texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    return tokenizer, word_index
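A minimal usage sketch (hypothetical `texts`; assumes `Tokenizer` and `base_filter` from `keras.preprocessing.text` are imported, as in the other snippets here):

texts = ["the cat sat on the mat", "the dog sat on the log"]
tokenizer, word_index = tokenization(texts)
print(tokenizer.texts_to_sequences(["the cat and the dog"]))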
Example 2
def __init__(self, field_name, nb_words=None, filters=base_filter(),
             lower=True, split=' ', char_level=False, arr_of_text=None):
    # avoid a mutable default argument: use None and fall back to an empty list
    self.text = ""
    self.field_name = field_name
    self.arr_of_text = arr_of_text if arr_of_text is not None else []
    super(token, self).__init__(nb_words, filters, lower, split, char_level)
    self.create_text()
    self.fit_on_texts([self.text])
Example 3
def bigram_text_to_word_sequence(text, bigram, filters=base_filter(), lower=False, split=" "):
    '''Split `text` into tokens, replacing every character in `filters` with
    `split`, then run the token list through `bigram` to merge word pairs.
    '''
    if lower:
        text = text.lower()
    text = text.translate(string.maketrans(filters, split*len(filters)))
    seq = text.split(split)
    sentences = [_f for _f in seq if _f]
    return bigram(sentences)
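A hedged usage sketch, not from the original snippet: `bigram` just needs to be a callable that takes the token list and returns it with word pairs merged, for example a gensim phrase model wrapped in a lambda (the gensim `Phrases`/`Phraser` usage and the toy training data are assumptions, and the snippet's own imports such as `string` and `base_filter` are taken to be in scope):

from gensim.models.phrases import Phrases, Phraser

# toy phrase model meant to pick up "new york" as a bigram
phrases = Phrases([["new", "york", "city"], ["new", "york", "times"]], min_count=1, threshold=1)
merge_bigrams = lambda tokens: Phraser(phrases)[tokens]

print bigram_text_to_word_sequence("New York is a big city", merge_bigrams, lower=True)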
Example 4
def convert_word_to_int(x_train, num_words, interp=True):
    x = []
    fun = None
    if interp:
        fun = no_filter()
    else:
        fun = base_filter()
    for row in x_train:
        x.append(one_hot(row, num_words, filters=fun))
    return x
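`no_filter` is not part of Keras; presumably it is a small project helper that returns an empty filter string so that `one_hot` keeps punctuation. A hypothetical sketch of what it might look like:

def no_filter():
    # an empty filter string means no characters are stripped before hashing
    return ''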
Example 5
def give_vocabulary(sentences_df):
    '''
    @parameter: the dataframe from the json file with the 5 columns we need
    @returns: the vocabulary in a set.
    '''
    vocabulary = []
    list_of_sentences1 = sentences_df['sentence1'].tolist()
    list_of_sentences2 = sentences_df['sentence2'].tolist()
    list_sentence_words = []
    '''
    # Do same with keras
    for sentence in list_of_sentences1:
        sentence.lower()
        #tokenize or split by " "
        tokens1 = sentence.split(" ")
        for token1 in tokens1:
            if token1 not in vocabulary:
                vocabulary.append(token1)
    '''
    list_sentence_word_tmp = []
    for s1, s2 in zip(list_of_sentences1, list_of_sentences2):
        sentence_unicode1 = make_unicode(s1)
        sentence_unicode2 = make_unicode(s2)
        #print sentence_no_unicode
        list_sentence_word_tmp += text_to_word_sequence(
            sentence_unicode1.encode('ascii'),
            filters=base_filter(),
            lower=True,
            split=" ")
        list_sentence_word_tmp += text_to_word_sequence(
            sentence_unicode2.encode('ascii'),
            filters=base_filter(),
            lower=True,
            split=" ")

    set_words = set(list_sentence_word_tmp)
    word2idx = {}
    for i, word in enumerate(set_words):
        word2idx[word] = int(i)

    #print word2idx
    print "length of vocabulary: %d" % len(set_words)
    return set_words, len(set_words), word2idx
Example 6
        filename = "data/de-en.tgz"
        urllib.urlretrieve(url, filename)
        tar = tarfile.open(filename, "r:gz")
        tar.extractall("data/")
        tar.close()
    lines = []
    with open("data/europarl-v7.de-en.en") as f:
        for line in f:
            line_processed = tf.compat.as_str(line).decode('utf-8').encode("utf-8")
            lines.append(line_processed)
            if len(lines) == max_lines:
                break
    return lines


tokenizer = Tokenizer(nb_words=None, filters=base_filter(),
        lower=True, split=" ")
# tokenizer = Tokenizer(nb_words=None)

lines = read_data()
print('Lines:', len(lines))

X = []
Y = []
counter = Counter()
for line in lines:
    words = line.lower().strip().split(" ")
    if len(words) < input_dim:
        continue
    sentence = []
    y = None
Example 7
def main():
	scoreList = [0.0,0.0]
	with open('data/info.json') as j:
		info = ujson.load(j)
	for problem in os.listdir('data'):
		greek=False
		if problem.startswith('problem'):
			truthPath = 'data/truth/'+problem+'/clustering.json'
			with open(truthPath) as t:
				truth = ujson.load(t)
			print(problem)
			probTokList = []
			docList = []
			docDict = {}
			X=[]
			Y=[]

			path = 'data/' + problem
			for entry in info:
				if entry["folder"] == problem:
					lang=entry["language"]
					if entry["language"] == "gr":
						greek=True

			CV = CountVectorizer(input='filename', strip_accents='unicode', analyzer='word', ngram_range=(1,4))
			docs = [path+'/'+x for x in os.listdir(path)]
			cMatrix = CV.fit_transform(docs)
			for doc in os.listdir(path):
				docTokList = []
				with open(path + '/' + doc) as d:
						article = d.readlines()
						for sent in article:
							sentTokList = []
							for word in sent.split():
								for token in word:
									procToken = preprop(token,greek)
									sentTokList.append(procToken) #Every item of the list is a normalized character
							docTokList.append(' '.join(sentTokList))#Every item of the list is a sentence
				probTokList.append(' '.join(docTokList))#Every item of the list is a document
				docList.append(doc)
			tokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ")
			tokenizer.fit_on_texts(probTokList)
			seqList = tokenizer.texts_to_sequences(probTokList)
			
			uniqueTokens = max([max(x) for x in seqList])

			print(uniqueTokens,lang)
			sampling_table = sequence.make_sampling_table(uniqueTokens+1)
			for i,seq in enumerate(seqList):
				x, y = sequence.skipgrams(seq, uniqueTokens, window_size=4, negative_samples=1.0, categorical=False, sampling_table=sampling_table)
				x = zip(x, y)
				X.append(x)
				#Y.extend(y)
				docDict[docList[i]] = seq
			strX=[str(x) for x in X]
			xTokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ")
			xTokenizer.fit_on_texts(strX)
			#docMatrix = tokenizer.sequences_to_matrix(seqList,mode="tfidf")
			docMatrix = xTokenizer.sequences_to_matrix(strX,mode="tfidf")
			#scores = embedNN(X,Y)
			pairs = combinations(docDict.keys(),2)
			cList = []
			nnDict = {}
			for cluster in truth:
				cPairs = []
				if len(cluster) > 1:
					for item in cluster:
						cPairs.append(str(item["document"]))
					cList.extend(list(permutations(cPairs,2)))
			for pair in pairs:
				match = False
				if pair in cList:
					match = True
				nnDict[pair] = match
			for i, doc in enumerate(docMatrix):
				docDict[docList[i]] = doc

			
			truthCounter =  Counter(nnDict.values())
			baseline = 1-float(truthCounter[True])/float(len(nnDict))
			print("Baseline for {} is {}".format(problem, baseline))
			clusterCount = Counter()
			kmclusters = False  # True: run KMeans for every possible cluster count; False: a single clustering below (MeanShift variant commented out)
			if kmclusters:
				pbar = ProgressBar()
				for nclusters in pbar(reversed(range(len(docMatrix)-1))):
					#print("{} Clusters".format(nclusters+1))
					clusters = KMclusterer(nclusters+1,cMatrix)
					for c in range(nclusters+1):
						#print(c,"has:",[i for i,x in enumerate(clusters) if x == c])
						for clusterpair in list(combinations([i for i,x in enumerate(clusters) if x == c],2)):
							combo = (docList[clusterpair[0]],docList[clusterpair[1]])
							clusterCount[combo] +=1
			else:
				clusters = KMclusterer(int(len(docMatrix)*0.67),docMatrix)
				#clusters = MSclusterer(cMatrix)#cMatrixdocMatrix
				for clusterpair in list(combinations([i for i,x in enumerate(clusters)],2)):
					combo = (docList[clusterpair[0]],docList[clusterpair[1]])
					clusterCount[combo] +=1

			x = 0.0 
			scoreList[0] += truthCounter[True]
			deleteList = []
			#print("Most common cluster is in {}%".format((float(clusterCount.most_common(20)[19][1])/len(docMatrix))*100))
			for combo in nnDict.keys():
				if combo not in clusterCount.keys():
					deleteList.append(combo)
			y = 0.0
			for item in deleteList:
				if item in cList:
					y+=1
				del nnDict[item]
			scores = sharedNN(docDict, nnDict)
			print("Deleted pairs are {}% of total correct pairs, {}% of deleted pairs was wrongly deleted".format(round(y/len(cList)*100.0,2), round(y/len(deleteList)*100.0,2)))

			for combo in clusterCount.most_common(20):
				if combo[0] in cList:
					x += 1
					scoreList[1] += 1
			print("prec: {}".format(x/20))
			#print("Document score is {} clusters correct out of {} (accuracy {})".format(x, truthCounter[True], x/truthCounter[True]))
			#print("prec: {} \nrec: {}".format(x/20, x/len(nnDict.values())))

	#print("Total precision  is {}, {} clusters correct".format(scoreList[1]/scoreList[0], scoreList[1]))


			if not os.path.exists('answers/'+problem):
				os.mkdir('answers/'+problem)
			clusDict = defaultdict(list)
			rankDict = defaultdict(list)
			for i, cluster in enumerate(list(clusters)):
				clusDict[cluster].append({"document": docList[i]})
				rankDict[cluster].append(docList[i])
			with open('answers/'+problem+'/clustering.json', "w") as jsonFile:
				ujson.dump(list(clusDict.values()), jsonFile, indent=4)
			rankList = []
			for value in rankDict.values():
				if len(value) > 1:
					for pair in combinations(value, 2):
						rankList.append({"document1": pair[0], "document2": pair[1], "score": scores[pair][0]})
			with open('answers/'+problem+'/ranking.json', "w") as jsonFile:
				ujson.dump(rankList, jsonFile, indent=4)
Example 8
def give_vocabulary(sentences_df):
    '''
    @parameter: the dataframe from the json file with the 5 columns we need
    @returns: the vocabulary in a set.
    '''
    vocabulary = []
    list_of_sentences1 = sentences_df['sentence1'].tolist()
    list_of_sentences2 = sentences_df['sentence2'].tolist()
    list_sentence_words = []
    '''
    # Do same with keras
    for sentence in list_of_sentences1:
        sentence.lower()
        #tokenize or split by " "
        tokens1 = sentence.split(" ")
        for token1 in tokens1:
            if token1 not in vocabulary:
                vocabulary.append(token1)
    '''
    list_sentence_word_tmp = []
    for s1, s2 in zip(list_of_sentences1, list_of_sentences2):
        sentence_unicode1 = make_unicode(s1)
        sentence_unicode2 = make_unicode(s2)
        #print sentence_no_unicode
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode1.encode('ascii'), filters=base_filter(), lower=True, split=" ")
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode2.encode('ascii'), filters=base_filter(), lower=True, split=" ")

    set_words = set(list_sentence_word_tmp)
    #print word2idx
    print "length of vocabulary: %d" % len(set_words)
    return set_words
Example 9
def fit_tokenizer(lines, max_words = None):
  tokenizer = Tokenizer(max_words, filters=base_filter().replace('_', ''))
  tokenizer.fit_on_texts(lines)
  return tokenizer
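For example (hypothetical `lines`; underscores survive tokenization because '_' is removed from the filter string):

lines = ["foo_bar baz", "baz qux foo_bar"]
tokenizer = fit_tokenizer(lines, max_words=10)
print(tokenizer.texts_to_sequences(lines))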
Example 10
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.visualize_util import model_to_dot, plot
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, base_filter
from gensim.models.doc2vec import Word2Vec

### added because it is needed for K.mean()
from keras import backend as K


# In[6]:

base_filter()
path = get_file('alice.txt', origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")
corpus = open(path).readlines()[0:200]

### drop lines with fewer than two spaces (blank/very short lines)
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

### add ' to base_filter
tokenizer = Tokenizer(filters=base_filter()+"'")

### fit the tokenizer on the corpus
tokenizer.fit_on_texts(corpus)

### convert the corpus to sequences (could not write it to a file otherwise)
corpus = tokenizer.texts_to_sequences(corpus)
Example 11
def test(text):
	print keras.preprocessing.text.text_to_word_sequence(
		text, filters=base_filter(), lower=True, split=" ")
	print keras.preprocessing.text.one_hot(
		text, 5, filters=base_filter(), lower=True, split=" ")
Example 12
filter_length = 5
nb_filter = 64
pool_length = 4

# LSTM
lstm_output_size = 100

# Training
batch_size = 100
nb_epoch = 3

X = [x[1] for x in labeled_sample]
y = [x[0] for x in labeled_sample]

tk = text.Tokenizer(nb_words=2000,
                    filters=text.base_filter(),
                    lower=True,
                    split=" ")
tk.fit_on_texts(X)
X = tk.texts_to_sequences(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.4))
model.add(
Example 13
def read_data(filename):
    """Read up to max_lines lines from the given text file."""
    lines = []
    with open(filename) as f:
        for line in f:
            line_processed = tf.compat.as_str(line).decode('utf-8').encode(
                "utf-8")
            lines.append(line_processed)
            if len(lines) == max_lines:
                break
    return lines


tokenizer = Tokenizer(nb_words=None,
                      filters=base_filter(),
                      lower=True,
                      split=" ")
# tokenizer = Tokenizer(nb_words=None)

filename = "data/europarl-v7.de-en.en"
lines = read_data(filename)
print('Lines:', len(lines))

X = []
Y = []
for line in lines:
    words = line.lower().strip().split(" ")
    if len(words) < max_sentence_length:
        continue
    sentence = []
Example 14
        with open(path, 'r') as f:
            yield f.read()


len(text_files)

###

# our corpus is small enough that we
# don't need to worry about this, but it's good practice
max_vocab_size = 50000

# `filters` specify what characters to get rid of
# `base_filter()` includes basic punctuation;
# I like to extend it with common unicode punctuation
tokenizer = Tokenizer(nb_words=max_vocab_size, filters=base_filter())
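# A hedged sketch, not in the original: extra characters can simply be appended
# to the filter string if they should be stripped too, e.g. apostrophes, which
# base_filter() leaves in place:
# tokenizer = Tokenizer(nb_words=max_vocab_size, filters=base_filter() + "'")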

# fit the tokenizer
tokenizer.fit_on_texts(text_generator())

# we also want to keep track of the actual vocab size
# we'll need this later
# note: we add one because `0` is a reserved index in keras' tokenizer
vocab_size = len(tokenizer.word_index) + 1

###

embedding_dim = 256

###
Example 15
parser.add_argument("--max_words", type=int)
parser.add_argument("--maxlen", type=int)
args = parser.parse_args()

print "Loading data..."
with open(args.data) as f:
  lines = f.readlines()

print "Lines:", len(lines)

print "Sample question, correct answer, incorrect answer:"
for i in xrange(3):
    print lines[i]

print "Tokenizing data..."
tokenizer = Tokenizer(args.max_words, filters=base_filter().replace('_', ''))
tokenizer.fit_on_texts(lines)
print "Number of words: ", len(tokenizer.word_index)

if args.tokenizer_save_path:
  print "Saving tokenizer to %s..." % args.tokenizer_save_path
  pickle.dump(tokenizer, open(args.tokenizer_save_path, "wb"), pickle.HIGHEST_PROTOCOL)

wcounts = tokenizer.word_counts.items()
wcounts.sort(key=lambda x: x[1], reverse=True)
print "Most frequent words:", wcounts[:10]
print "Most rare words:", wcounts[-10:]

print "Number of words occurring %d times:" % wcounts[-1][1], np.sum(np.array(tokenizer.word_counts.values())==wcounts[-1][1])

print "Converting text to sequences..."
Example 16
def bigram_one_hot(text, n, bigram, filters=base_filter(), lower=False, split=" "):
    seq = bigram_text_to_word_sequence(text, bigram, filters=filters, lower=lower, split=split)
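    # Hashing trick (same scheme as Keras' built-in one_hot): each token maps
    # to an index in [1, n-1], index 0 stays reserved, and distinct words may
    # collide in the same bucket.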
    return [(abs(hash(w)) % (n - 1) + 1) for w in seq]
Example 17
from IPython.display import SVG
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.visualize_util import model_to_dot, plot
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, base_filter
from gensim.models.doc2vec import Word2Vec

### added because it is needed for K.mean()
from keras import backend as K

# In[6]:

base_filter()
path = get_file('alice.txt',
                origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")
corpus = open(path).readlines()[0:200]

### drop lines with fewer than two spaces (blank/very short lines)
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

### add ' to base_filter
tokenizer = Tokenizer(filters=base_filter() + "'")

### fit the tokenizer on the corpus
tokenizer.fit_on_texts(corpus)

### convert the corpus to sequences (could not write it to a file otherwise)
corpus = tokenizer.texts_to_sequences(corpus)
Example 18
# a = ["a d d", "d a"]
# a = ["我是一个爱生活的人", "他也是一个爱生活的人"]
# one_h = one_hot(filters=base_filter(), n=30, text=a)
# # o.fit_on_texts(a)
# # b = one_h(a)
# print one_hot(filters=base_filter(), n=30, text=a)
# print one_hot(filters=base_filter(), n=30, text=a)

# a=['hello world', 'foo bar']
# tokenizer = Tokenizer()
# train_tokens = tokenizer.fit_transform(a)
# print train_tokens
# comma_tokenizer = lambda x: jieba.cut(x, cut_all=True)
# from sklearn.feature_extraction.text import HashingVectorizer
# v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000, non_negative=True)
# train_data = v.fit_transform(a)
# print train_data

import jieba  # required for jieba.cut below
a = "我是一个男孩"
c = jieba.cut(a, cut_all=False)
w = ""
# print(", ".join(c))
for i in c:
    w += i + " "
    # print i
w = w[:len(w)-1].encode('utf8')
# w = "我 是 一个男孩"
print one_hot(filters=base_filter(), n=30000, text=w)
# print w
# # print c.next()
Example 19
parser.add_argument("--max_words", type=int)
parser.add_argument("--maxlen", type=int)
args = parser.parse_args()

print "Loading data..."
with open(args.data) as f:
    lines = f.readlines()

print "Lines:", len(lines)

print "Sample question, correct answer, incorrect answer:"
for i in xrange(3):
    print lines[i]

print "Tokenizing data..."
tokenizer = Tokenizer(args.max_words, filters=base_filter().replace('_', ''))
tokenizer.fit_on_texts(lines)
print "Number of words: ", len(tokenizer.word_index)

if args.tokenizer_save_path:
    print "Saving tokenizer to %s..." % args.tokenizer_save_path
    pickle.dump(tokenizer, open(args.tokenizer_save_path, "wb"),
                pickle.HIGHEST_PROTOCOL)

wcounts = tokenizer.word_counts.items()
wcounts.sort(key=lambda x: x[1], reverse=True)
print "Most frequent words:", wcounts[:10]
print "Most rare words:", wcounts[-10:]

print "Number of words occurring %d times:" % wcounts[-1][1], np.sum(
    np.array(tokenizer.word_counts.values()) == wcounts[-1][1])
Example 20
# Convolution
filter_length = 5
nb_filter = 64
pool_length = 4

# LSTM
lstm_output_size = 100

# Training
batch_size = 100
nb_epoch = 3

X = [x[1] for x in labeled_sample]
y = [x[0] for x in labeled_sample]

tk = text.Tokenizer(nb_words=2000, filters=text.base_filter(),
                    lower=True, split=" ")
tk.fit_on_texts(X)
X = tk.texts_to_sequences(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.4))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='tanh',