import json

from sklearn.neural_network import MLPClassifier
# Alternative classifiers tried below (commented out):
# from sklearn.linear_model import LogisticRegression
# from sklearn import tree

from process import tokanize, clean, getVocabulary, getBOW
# readTrainingData/readTestingData are assumed to live in process.py as well:
from process import readTrainingData, readTestingData


def main():
    # Read the data.
    print("reading data...")
    trainingData, trainingLabels = readTrainingData()
    testingData, testingLabels = readTestingData()
    print("done")

    # Tokenize.
    print("tokanize...")
    trainingTokens = tokanize(trainingData)
    testingTokens = tokanize(testingData)
    # print(trainingTokens[:5])
    print("done")

    # Clean the data.
    print("cleaning the data...")
    trainingTokens = clean(trainingTokens)
    testingTokens = clean(testingTokens)
    print("done")

    # Build the vocabulary and write it to disk.
    print("creating vocabulary")
    vocab = getVocabulary(trainingTokens)
    with open("../data/vocab.json", "w") as f:
        f.write(json.dumps(vocab))
    print(len(vocab))

    # Load a previously written vocabulary instead:
    # print("load vocabulary...")
    # with open("../data/vocab.json") as vocab_file:
    #     vocab = json.load(vocab_file)
    # print(len(vocab))
    # print("done!")

    # Create the bag-of-words vectors.
    print("creating bag of words")
    words_vector = getBOW(trainingTokens, vocab)
    words_vector_test = getBOW(testingTokens, vocab)
    # print(words_vector[2])
    print("done")

    # Create the model.
    # Logistic regression:
    # clf = LogisticRegression(verbose=True, random_state=0)
    # Decision tree:
    # clf = tree.DecisionTreeClassifier()
    # Multi-layer perceptron:
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(112, 10), random_state=1)

    print("training the model...")
    clf.fit(words_vector, trainingLabels)
    print("done")

    score = clf.score(words_vector_test, testingLabels)
    print("score: " + str(score))


if __name__ == "__main__":
    main()
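# A minimal sketch of what the process.py helpers used above are assumed to
# do. These are illustrative stand-ins under that assumption, not the
# project's actual implementations.
def tokanize_sketch(texts):
    # Split each raw document into lowercase word tokens.
    return [text.lower().split() for text in texts]


def getVocabulary_sketch(token_lists):
    # Map each distinct training token to a column index.
    vocab = {}
    for tokens in token_lists:
        for token in tokens:
            vocab.setdefault(token, len(vocab))
    return vocab


def getBOW_sketch(token_lists, vocab):
    # One count vector per document, one column per vocabulary word;
    # tokens outside the vocabulary are ignored.
    vectors = []
    for tokens in token_lists:
        row = [0] * len(vocab)
        for token in tokens:
            if token in vocab:
                row[vocab[token]] += 1
        vectors.append(row)
    return vectors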
import argparse
import pprint
import sys
from itertools import combinations

import numpy as np

import process  # local helpers: process.clean / process.words
import reader   # local helpers: reader.read / reader.request


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('letters', metavar='N', type=str, nargs='+',
                        help='gather strings')
    args = parser.parse_args()

    question = np.array(args.letters)
    question_range = range(3, question.shape[0] + 1)

    # Load the word list from a local copy instead:
    # words = reader.read('../english-words/words_dictionary.json')
    words = reader.request('https://raw.githubusercontent.com/dwyl/english-words/master/words_dictionary.json')
    if not words:
        print('Cannot Find Words')
        sys.exit(1)

    # Check every combination of the given letters, from length 3 up to the
    # full input, against the dictionary.
    answers = []
    for i in question_range:
        combination = combinations(question, i)
        for combi in combination:
            cleaned = process.clean(combi)
            equivalents = process.words(words, cleaned)
            for equivalent in equivalents:
                answers.append(equivalent['words'])

    pprint.pprint(answers)
    return


if __name__ == '__main__':
    main()
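# A quick illustration of the combination step above: itertools.combinations
# yields every tuple of the requested length, in input order (hypothetical
# three-letter input).
from itertools import combinations

for combi in combinations(['c', 'a', 't'], 2):
    print(combi)  # ('c', 'a'), ('c', 't'), ('a', 't')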
import gensim
from gensim import corpora

from process import clean  # assumed local helper returning a cleaned string


def lda(doc_complete):
    # Model the topics in the documents using LDA.
    doc_clean = [clean(doc).split() for doc in doc_complete]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    Lda = gensim.models.ldamodel.LdaModel
    myTopicModel = Lda(doc_term_matrix, id2word=dictionary, passes=50)
    # Note: get_document_topics over the whole corpus yields one topic
    # distribution per document, so len() counts documents here.
    numberOfModelTopics = len(
        myTopicModel.get_document_topics(doc_term_matrix))
    return numberOfModelTopics
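# A minimal usage sketch for lda() above, with hypothetical documents;
# assumes clean() lowercases and strips punctuation.
docs = [
    "the quick brown fox jumps over the lazy dog",
    "movie reviews often mention the plot and the acting",
]
print(lda(docs))  # with the len() above, this prints the document count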
import json

from joblib import load
from sklearn.linear_model import LogisticRegression  # class of the pickled model (per the BOW_LR filename)

from process import tokanize, clean, getBOW

# Read a review from the user and score it with the saved bag-of-words model.
print("write your review:")
review = [input()]

token = tokanize(review)
token = clean(token)
# print(token)

with open("../data/vocab.json") as vocab_file:
    vocab = json.load(vocab_file)
vector = getBOW(token, vocab)

clf = load("../models/BOW_LR_WEBSITE.joblib")
# print(vector)
prediction = clf.predict(vector)
print("your movie score is: " + str(prediction[0]))
import time

import process  # local extension: pointer-based store (setup/set/get/clean)

data = 'abdqwbeujqehujqkahwsdjkwqgherjqwabndjkgqukwjehnkqwbadxukjqwghjkkbjdklbqwuikbjklhgeujkdqwbnadsxjkbquwkgedujkqwbsdjkxhqwuikhbedjkgbqwujkasgxjkqwbeukdsgq'
iterations = 100000  # renamed from `iter`, which shadows the builtin

# Time repeated set/get round trips against the handle returned by setup().
ptr = process.setup(1234)
start_time = time.time()
for i in range(iterations):
    process.set(ptr, data)
    value = process.get(ptr)
print(time.time() - start_time)

process.clean(1234, ptr)
import time

import process  # key-based variant of the same extension: set/get/clean by string key

data = 'abdqwbeujqehujqkahwsdjkwqgherjqwabndjkgqukwjehnkqwbadxukjqwghjkkbjdklbqwuikbjklhgeujkdqwbnadsxjkbquwkgedujkqwbsdjkxhqwuikhbedjkgbqwujkasgxjkqwbeukdsgq'
iterations = 100000  # renamed from `iter`, which shadows the builtin

# Time the same set/get round trips, addressing the store by string key
# instead of by the handle used in the pointer-based benchmark above.
start_time = time.time()
for i in range(iterations):
    process.set("abcd", data)
    value = process.get("abcd")
print(time.time() - start_time)

process.clean("abcd")
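# A pure-Python stand-in for the key-based process module benchmarked above,
# so the script can run without the compiled extension. This is only a sketch
# of the assumed set/get/clean semantics, backed by a plain dict.
_store = {}


def set(key, value):  # shadows the builtin to mirror the extension's API
    _store[key] = value


def get(key):
    return _store.get(key)


def clean(key):
    _store.pop(key, None)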