def set_database():
	"""
	This function sets up the database for the corpus documents
	"""
	corpus="/home/stark/git/Different-news-articles-classified/for_demo/Corpus"
	ch=raw_input("1.Set corpus path\n2.Use default path\n")
	if ch=="1":
		corpus=raw_input("Enter complete path of corpus:")
	cwd=os.getcwd()
	db=createdb.database()
	doc_list=BOW(db,corpus,cwd)
	N=db.get_no_of_doc()
	Total_words=db.get_total_words()
	print "Number of documents in corpus:"+str(N)
	print "Total number of words:"+str(Total_words)
	fil=open("query_setup.txt","w")
	fil.write(str(N)+"\n")
	fil.write(str(Total_words)+"\n")
	fil.close()
	fil2=open("doc_map.txt","w")
	temp=[(k,v) for k,v in doc_list.items()]
	for k,v in temp:
		fil2.write(str(k)+" "+str(v)+"\n")
	fil2.close()	
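A minimal sketch (not from the original example) of how these files could be read back later, assuming exactly the formats written above: two integer lines in query_setup.txt and space-separated key/value lines in doc_map.txt. load_database_setup is a hypothetical helper name.

def load_database_setup():
	# Read back the corpus statistics written by set_database()
	with open("query_setup.txt") as fil:
		N = int(fil.readline())
		total_words = int(fil.readline())
	# Rebuild the document map from the "key value" lines
	doc_map = {}
	with open("doc_map.txt") as fil2:
		for line in fil2:
			k, v = line.strip().split(" ", 1)
			doc_map[k] = v
	return N, total_words, doc_map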
Example #2
def set_database():
    """
	This function sets up the database for the corpus documents
	"""
    corpus = "/home/mudit/git_repo/News/Corpus"
    ch = raw_input("1.Set corpus path\n2.Use default path\n")
    if ch == "1":
        corpus = raw_input("Enter complete path of corpus:")
    cwd = os.getcwd()
    db = createdb.database()
    doc_list = BOW(db, corpus, cwd)
    N = db.get_no_of_doc()
    Total_words = db.get_total_words()
    print "Number of documents in corpus:" + str(N)
    print "Total number of words:" + str(Total_words)
    fil = open("query_setup.txt", "w")
    fil.write(str(N) + "\n")
    fil.write(str(Total_words) + "\n")
    fil.close()
    fil2 = open("doc_map.txt", "w")
    temp = [(k, v) for k, v in doc_list.items()]
    for k, v in temp:
        fil2.write(str(k) + " " + str(v) + "\n")
    fil2.close()
Example #3
    def load_data(self, file, validate=True, label=True):

        fname = file[:-5]  # drop the trailing 5-character extension (e.g. '.json') when naming the pickle

        ### THIS BLOCK USED TO PREPROCESS TRAINING DATA ###
        all_avail = pandas.read_json(open(file, 'r'))
        all_avail['text'] = all_avail['text'].apply(func=self.preprocess)
        pickle.dump(all_avail, open('pickles/' + fname + '.pickle', 'wb'))

        ### UNCOMMENT TO LOAD SAVED PREPROCESSED FILES
        #all_avail = pickle.load(open('pickles/'+fname + '.pickle', 'rb'))
        print('Finished preprocessing')

        # 80-20 Split for training and development
        if validate:
            train_set = all_avail.sample(frac=0.8, random_state=1)
            dev_set = all_avail.drop(train_set.index)
        else:
            train_set = all_avail

        # Instantiate bag-of-words object
        bow = BOW(train_set)

        # Create vocabulary
        bow.create_bigram_vocabulary(500)
        bow.create_vocabulary(500)
        print('Finished BOW')

        self.feats = FeatureExtractor(bow)
        train_X, train_label = self.feats.df_to_feats_skl(train_set, label)

        joblib.dump(self.feats, 'pickles/feats.pickle')
        if validate:
            dev_X, dev_label = self.feats.df_to_feats_skl(dev_set, label)
            return train_X, train_label, dev_X, dev_label

        else:
            return train_X, train_label
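A hedged usage sketch of the method above; trainer and 'reviews.json' are hypothetical names, and it assumes df_to_feats_skl returns scikit-learn-compatible feature arrays.

from sklearn.linear_model import LogisticRegression

# Hypothetical driver: preprocess, build features, then fit and score a baseline classifier
train_X, train_label, dev_X, dev_label = trainer.load_data('reviews.json', validate=True)
clf = LogisticRegression(max_iter=1000)
clf.fit(train_X, train_label)
print('dev accuracy:', clf.score(dev_X, dev_label))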
Example #4
sen_len = 40
fix_embedding = True  # fix embedding during training
batch_size = 128
epoch = 30
lr = 0.0001
# model_dir = os.path.join(path_prefix, 'model/') # model directory for checkpoint model
model_dir = path_prefix  # model directory for checkpoint model

print("loading data ...")  # 把'training_label.txt'跟'training_nolabel.txt'讀進來
train_x, y = utils.load_training_data(train_with_label)
train_x_no_label = utils.load_training_data(train_no_label)
test_x = utils.load_testing_data(testing_data)

# Preprocess the inputs and labels
max_len = 1200
b = BOW(max_len=max_len)
b.bow(train_x, test_x)
train_x = b['train']
#import pdb
#pdb.set_trace()
y = [int(label) for label in y]
y = torch.LongTensor(y)

# Create a model object
model = D_Net(embedding_dim=max_len, num_layers=1)
model = model.to(device)  # device is "cuda"; the model trains on the GPU (inputs fed in must also be cuda tensors)

# Split the data into training and validation data (hold out part of the training data for validation)
X_train, X_val = train_x[:190000], train_x[190000:]
y_train, y_val = y[:190000], y[190000:]
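A minimal sketch (not in the original script) of batching this split with TensorDataset and DataLoader before training D_Net, assuming train_x is a dense numeric array.

from torch.utils.data import TensorDataset, DataLoader

# Wrap each split as tensors so the model can be fed mini-batches
train_ds = TensorDataset(torch.FloatTensor(X_train), y_train)
val_ds = TensorDataset(torch.FloatTensor(X_val), y_val)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)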
Example #5
correctedData = correctedData.values

print("-------------CREANDO INDICES DE TEST Y TRAIN------------------")
#indxTest,indxTrain,indxVal = utils.separate_dataset(correctedData,cantidad_preg)
#cant_test = len(indxTest)
#cant_train = len(indxTrain)
#cant_val = len(indxVal)
#print("cantidad de patrones de prueba: ",cant_test)
#print("cantidad de patrones de entrenamiento: ",cant_train)
#print("cantidad de patrones de validacion: ",cant_val)
print(
    "-------------FINISHED CREATING TEST AND TRAIN INDICES------------------\n"
)
stoplist = stopwords.words('spanish')
text = correctedData[:, 1]
bow_unigram = BOW(text, 'ascii', stoplist, weighting=True)

print(
    "-------------CREATING Ytest and Ytrain (ground truth for each subset), Xtext_test and Xtext_train-------------------"
)
Y = np.zeros((cantidad_preg), dtype=np.int64)
for i in range(cantidad_preg):
    Y[i] = correctedData[i, 0]

Y = torch.from_numpy(Y)
print(Y)
"""
#assert cant_test + cant_train == cantidad_preg 
#Ytrain = np.zeros((cant_train,1), dtype=np.int64)
Ytrain = np.zeros((cant_train),dtype=np.int64) #classes of the patterns in the train subset
Xtrain = np.zeros((cant_train, bow_unigram.X.shape[1]), dtype=np.float) #option to improve performance: initialize each X matrix as sparse
Example #6
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import torch
from sklearn.metrics import balanced_accuracy_score
from mpl_toolkits import mplot3d
import scipy.interpolate as interp
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import GradientBoostingClassifier

correctedData = pn.read_csv(
    "C:/Users/lucy/chatbot/preprocessedQuestions_lem_completadas.csv",
    delimiter=',',
    header=None)  # comment out this line if uncommenting the line above
cantidad_preg = correctedData.shape[0]
correctedData = correctedData.values
stoplist = stopwords.words('spanish')
bow_unigram = BOW(correctedData[:, 1], 'ascii', stoplist, weighting=False)

Y = np.zeros((cantidad_preg), dtype=np.int64)
for i in range(cantidad_preg):
    Y[i] = correctedData[i, 0]

Y = torch.from_numpy(Y)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
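The example is cut off here. A hedged sketch of how such a grid is typically handed to RandomizedSearchCV with a random forest; the min_samples_split values and search settings below are illustrative assumptions, and it assumes bow_unigram exposes its document-term matrix as .X, as in the later example.

from sklearn.ensemble import RandomForestClassifier

min_samples_split = [2, 5, 10]  # illustrative values for the truncated comment above
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}
# Sample random parameter combinations and cross-validate each one
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(),
                               param_distributions=random_grid,
                               n_iter=50, cv=3, random_state=42, n_jobs=-1)
rf_random.fit(bow_unigram.X, Y.numpy())
print(rf_random.best_params_)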
Example #7
#trainX_RF = pn.read_csv("trainX_RF.csv",header=None,delimiter=',').values
#trainY_RF = pn.read_csv("trainY_RF.csv",header=None,delimiter=',').values
#testX_RF = pn.read_csv("testX_RF.csv",header=None,delimiter=',').values
#testY_RF = pn.read_csv("testY_RF.csv",header=None,delimiter=',').values

correctedData = pn.read_csv("C:/Users/lucy/chatbot/preprocessedQuestions_lem.csv",delimiter=',') #comentar esta linea en caso de descomentar la anterior
cantidad_preg = correctedData.shape[0]
correctedData = correctedData.values
print(type(correctedData))
print(correctedData.dtype)
Xtrain_text,trainY_RF,Xtest_text,testY_RF,_,_ = utils.separate_dataset(correctedData,cantidad_preg,validation=False)
# print('Everything OK up to here')
# Instantiate model 
stoplist = stopwords.words('spanish')
print(Xtrain_text.shape)
bow_unigram = BOW(Xtrain_text.ravel(),'ascii',stoplist,weighting = False)
trainX_RF = bow_unigram.X
testX_RF = bow_unigram.vectorizer.transform(Xtest_text.ravel())
print('Training Features Shape:', trainX_RF.shape)  # num_patterns x num_features
print('Training Labels Shape:', trainY_RF.shape)  # column vector
print('Testing Features Shape:', testX_RF.shape)
print('Testing Labels Shape:', testY_RF.shape)  # column vector

# Import matplotlib for plotting and use magic command for Jupyter Notebooks


# # Set the style
# plt.style.use('fivethirtyeight')

print('Instantiated the model...')
# rf = RandomForestRegressor(n_estimators= 10000, random_state=42)
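A hedged sketch of the fit-and-evaluate step this snippet builds toward, assuming a classifier (rather than the commented-out regressor) is trained on the bag-of-words features; the n_estimators value below is illustrative.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a random forest on the unigram features and score it on the held-out split
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(trainX_RF, trainY_RF.ravel())
pred = rf.predict(testX_RF)
print('test accuracy:', accuracy_score(testY_RF.ravel(), pred))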
Example #8
        train_set = json.load(open(train_file, 'r'))
        print('Loaded Json')
        train_set_preprocessed = preprocess(train_set)
    else:
        train_set_preprocessed = pickle.load(
            open('train_set_stop.pickle', 'rb'))
        print('Loaded from Pickle')

    #80-20 split for train and dev
    train_percent = 80
    train_value = math.floor(.8 * len(train_set_preprocessed))

    train_set = train_set_preprocessed[:train_value]
    dev_set = train_set_preprocessed[train_value:]

    bow = BOW(train_set_preprocessed)
    bow.calculate_independent_words()
    pickle.dump(bow, open('bow.pickle', 'wb'))
    print('Finished BOW')

    train = [(feature_extractor.extract_all(x), int(x['stars']))
             for x in train_set]
    dev = [(feature_extractor.extract_all(x), int(x['stars']))
           for x in dev_set]

    print(train[:5])

    model = NBModel(train, dev)
    print(model.validate())
    print(model.informative_features())
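NBModel is not defined in this excerpt. A minimal sketch of what such a wrapper could look like if it is backed by nltk's Naive Bayes classifier, which is an assumption based on the (feature dict, label) pairs built above.

import nltk

class NBModel(object):
    """Hypothetical wrapper around nltk.NaiveBayesClassifier."""
    def __init__(self, train, dev):
        self.dev = dev
        self.classifier = nltk.NaiveBayesClassifier.train(train)

    def validate(self):
        # Accuracy on the held-out (features, label) pairs
        return nltk.classify.accuracy(self.classifier, self.dev)

    def informative_features(self, n=10):
        # Most informative (feature, value) pairs learned during training
        return self.classifier.most_informative_features(n)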
Example #9
    args = parser.parse_args()
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.task in ("tagging"):

        data = TaggingData(args)

        model = MajorityVote()
        model.train(data)

        print("Storing model and data")
        with open(args.model, "wb") as F:
            pickle.dump(model, F)
        with open(args.data_file, "wb") as F:
            pickle.dump(data, F)

    elif args.task in ("classification"):
        data = ClassifyData(args)
        model = BOW()
        model.train(data)


        print("Storing model and data")
        with open(args.model, "wb") as F:
            pickle.dump(model, F)
        with open(args.data_file, "wb") as F:
            pickle.dump(data, F)
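The argparse setup is outside this excerpt. A hedged sketch of argument definitions consistent with the attributes read above; the flag names and help strings are assumptions.

import argparse

parser = argparse.ArgumentParser()
# Flag names assumed from the attributes used above: args.task, args.model, args.data_file
parser.add_argument("--task", choices=["tagging", "classification"])
parser.add_argument("--model", help="output path for the pickled model")
parser.add_argument("--data_file", help="output path for the pickled data")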


Example #10
# define dataset & dataloader
##########
train_dataset = MSADataset(dataset['train'][0], dataset['train'][1], corpus)
valid_dataset = MSADataset(dataset['valid'][0], dataset['valid'][1], corpus)
print("training data = {}".format(len(train_dataset)))
print("validation data = {}".format(len(valid_dataset)))

MSADataLoader = partial(DataLoader, collate_fn=collate2)
train_dataloader = MSADataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = MSADataLoader(valid_dataset, batch_size=batch_size)


##########
# define model
##########
model = BOW(len(vocab), embed_size, len(corpus))
model = torch.load("model3.p")
cuda = torch.cuda.is_available()
if cuda:
    model.cuda()
opt = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

def hasnan(x):
    n = (x.data != x.data).sum()
    return n != 0


def clip_grad(grad, clip=20):
    thres = torch.ones(grad.data.size())*clip
    if cuda:
        thres = thres.cuda()