import numpy as np

# token() is the preprocessing helper defined elsewhere in this repository.


def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels, along with the global variables
    Output  -> Tokenized sentences as a list of lists, and the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines
    with open(Masterdir + filename, 'r', encoding='utf-8') as f:
        lines = f.read().lower().split('\n')[:-1]

    X_train = []
    Y_train = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)

        # token() implements the basic preprocessing described in the paper
        tokenized_lines = token(line[datacol])
        X_train.append(tokenized_lines)

        # Appends labels (0/1/2 for the three classes)
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        elif line[labelcol] == labels[1]:
            Y_train.append(1)
        elif line[labelcol] == labels[2]:
            Y_train.append(2)

    # Converts Y_train to a numpy array; the assert catches lines with unknown labels
    Y_train = np.asarray(Y_train)
    assert len(X_train) == Y_train.shape[0]

    return [X_train, Y_train]
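# Usage sketch (illustrative, not from the original repo). The directory,
# filename, column indices and label strings below are hypothetical
# placeholders; token() must already be defined, as in the repo.
X_train, Y_train = parse(
    Masterdir='data/',                           # hypothetical directory
    filename='train.tsv',                        # hypothetical file
    seperator='\t',
    datacol=1,                                   # column holding the sentence
    labelcol=2,                                  # column holding the label
    labels=['negative', 'neutral', 'positive'],  # hypothetical label strings
)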
def parse1(Masterdir1, filename1, seperator1, datacol1, labelcol1, labels1):
    """
    Purpose -> Data I/O for the test split; datacol1 is the column holding the sentence
    Input   -> Data file containing sentences and labels, along with the global variables
    Output  -> Sentences as character lists, and the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines.
    # NOTE: the hardcoded path below overrides the Masterdir1/filename1 arguments.
    # f = open(Masterdir1 + Datadir1 + filename1, 'r', encoding='utf-8')
    f = open('E:/复现/BAKSA_IITK-master/HASOC_Off/Data/task1/翻译test+lable.txt',
             'r', encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]

    X_test = []
    Y_test = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator1)

        # token() implements the basic preprocessing described in the paper
        tokenized_lines = token(line[datacol1])

        # Creates character lists: each sentence becomes a flat list of
        # characters with a space inserted after every word
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        X_test.append(char_list)

        # Appends labels
        if line[labelcol1] == labels1[0]:
            Y_test.append(0)
        elif line[labelcol1] == labels1[1]:
            Y_test.append(1)

    # Converts Y_test to a numpy array; the assert catches lines with unknown labels
    Y_test = np.asarray(Y_test)
    assert len(X_test) == Y_test.shape[0]

    return [X_test, Y_test]
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels, along with the global variables
    Output  -> Sentences as character lists, and the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines.
    # NOTE: the hardcoded path below overrides the Masterdir/filename arguments.
    # f = open(Masterdir + Datadir + filename, 'r', encoding='utf-8')
    f = open('E:/复现/BAKSA_IITK-master/HASOC_Off/Data/task1/翻译ml-Hasoc-offensive-train.txt',
             'r', encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]

    X_train = []
    Y_train = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)

        # token() implements the basic preprocessing described in the paper
        tokenized_lines = token(line[datacol])

        # Creates character lists: each sentence becomes a flat list of
        # characters with a space inserted after every word
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        X_train.append(char_list)

        # Appends labels
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        elif line[labelcol] == labels[1]:
            Y_train.append(1)

    # Converts Y_train to a numpy array; the assert catches lines with unknown labels
    Y_train = np.asarray(Y_train)
    assert len(X_train) == Y_train.shape[0]

    return [X_train, Y_train]
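# Worked illustration (assumed input, not repo data) of the character-list
# construction above: tokens are flattened into one character stream with a
# space after every word.
example_tokens = ['good', 'movie']   # hypothetical token() output
example_chars = []
for word in example_tokens:
    example_chars.extend(word)
    example_chars.append(' ')
assert example_chars == ['g', 'o', 'o', 'd', ' ', 'm', 'o', 'v', 'i', 'e', ' ']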
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels, along with the global variables
    Output  -> Tokenized sentences as a list of lists, and the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines
    f = open(Masterdir + filename, 'r', encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]

    X_train = []
    Y_train = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)

        # token() implements the basic preprocessing described in the paper
        tokenized_lines = token(line[datacol])

        # Character-list creation (used by the other parse variants) is disabled
        # here; word-level tokens are fed to the model directly
        X_train.append(tokenized_lines)

        # Appends labels (0/1/2 for the three classes)
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        elif line[labelcol] == labels[1]:
            Y_train.append(1)
        elif line[labelcol] == labels[2]:
            Y_train.append(2)

    # Converts Y_train to a numpy array; the assert catches lines with unknown labels
    Y_train = np.asarray(Y_train)
    assert len(X_train) == Y_train.shape[0]

    return [X_train, Y_train]
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels, along with the global variables
    Output  -> Sentences as character lists, and the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines.
    # NOTE: the hardcoded path below overrides the Masterdir/filename arguments.
    f = open('E:/复现/BAKSA_IITK-master/HASOC_Off/Data/task2/Malayalam_offensive_data_Training-YT.txt',
             'r', encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]
    lines = lines[1:]  # skips the header row

    X_train = []
    Y_train = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)

        # token() implements the basic preprocessing described in the paper
        tokenized_lines = token(line[datacol])

        # Creates character lists: each sentence becomes a flat list of
        # characters with a space inserted after every word
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        X_train.append(char_list)

        # Appends labels
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        elif line[labelcol] == labels[1]:
            Y_train.append(1)

    # Converts Y_train to a numpy array; the assert catches lines with unknown labels
    Y_train = np.asarray(Y_train)
    assert len(X_train) == Y_train.shape[0]

    return [X_train, Y_train]
def parsetest(Masterdir, filename, seperator, datacol, idlcol):
    """
    Purpose -> Data I/O for the unlabelled test set
    Input   -> Data file containing sentences and ids, along with the global variables
    Output  -> Tokenized sentences as a list of lists, and the corresponding ids
    """
    # Reads the file and splits the data into individual lines
    f = open(Masterdir + filename, 'r', encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]

    X_test = []
    id_test = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)

        # token() implements the basic preprocessing described in the paper
        tokenized_lines = token(line[datacol])
        X_test.append(tokenized_lines)
        id_test.append(line[idlcol])

    return [X_test, id_test]
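# Usage sketch (illustrative, not from the original repo). All argument values
# are hypothetical placeholders; parsetest() mirrors parse() but returns the id
# column instead of labels, e.g. for writing a submission file.
X_test, id_test = parsetest(
    Masterdir='data/',
    filename='test.tsv',
    seperator='\t',
    datacol=1,   # column holding the sentence
    idlcol=0,    # column holding the instance id
)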
from keras.models import model_from_json
from keras.preprocessing import sequence

# Masterdir, Modeldir, experiment_details, mapping_char2num, batch_size and
# token() are assumed to be defined earlier in this file.

# Loads the serialized model architecture and weights.
# NOTE: the open() call was truncated in the original; the architecture
# filename below is an assumption, mirroring the weights file naming scheme.
f = open(Masterdir + Modeldir + 'LSTM_' + experiment_details + '_architecture.json', 'r+')
json_string = f.read()
f.close()

model = model_from_json(json_string)
model.load_weights(Masterdir + Modeldir + 'LSTM_' + experiment_details + '_weights.h5')
model.compile(loss='categorical_crossentropy', optimizer='adamax',
              metrics=['accuracy'])

# Interactive prediction loop: reads a sentence, maps its characters to
# integer ids, pads the sequence, and predicts its class
while True:
    inp_sent = input("Enter a sentence. Press 'Q' to exit.\n")
    if inp_sent == 'Q':
        break

    inp_sent = token(inp_sent)
    X_test = []
    temp = []
    for words in inp_sent:
        for char in words:
            temp.append(mapping_char2num[char])
        temp.append(mapping_char2num[' '])
    X_test.append(temp)

    X_test = np.asarray(X_test)
    print(X_test.shape)
    X_test = sequence.pad_sequences(X_test[:], maxlen=200)
    print(X_test.shape)

    # score, acc = model.evaluate(X_test, y_test2, batch_size=batch_size)
    y_pred = model.predict_classes(X_test, batch_size=batch_size)
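# Compatibility note: Sequential.predict_classes() was removed in TensorFlow 2.6.
# On a modern Keras stack the equivalent prediction step is:
import numpy as np

probs = model.predict(X_test, batch_size=batch_size)
y_pred = np.argmax(probs, axis=-1)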
def parse(Masterdir, filename, seperator, datacol, labelcol, labels):
    """
    Purpose -> Data I/O
    Input   -> Data file containing sentences and labels, along with the global variables
    Output  -> Sentences as character lists, and the labels as a numpy array
    """
    # Reads the file and splits the data into individual lines
    f = open(Masterdir + Datadir + filename, 'r', encoding='utf-8')
    lines = f.read().lower().split('\n')[:-1]

    X_train = []
    Y_train = []

    # Processes individual lines
    for line in lines:
        # Separator for the current dataset. Currently '\t'.
        line = line.split(seperator)

        # token() implements the basic preprocessing described in the paper
        tokenized_lines = token(line[datacol])

        # Creates character lists: each sentence becomes a flat list of
        # characters with a space inserted after every word
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        X_train.append(char_list)

        # Appends labels (0/1/2 for the three classes)
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        elif line[labelcol] == labels[1]:
            Y_train.append(1)
        elif line[labelcol] == labels[2]:
            Y_train.append(2)

    # Converts Y_train to a numpy array; the assert catches lines with unknown labels
    Y_train = np.asarray(Y_train)
    assert len(X_train) == Y_train.shape[0]

    return [X_train, Y_train]