print('\n\nPlease uncomment nltk.download() to download text data sets \n')
# nltk.download()  # Download text data sets, including stop words

# Cleaned training data: a list of space-joined symptom word lists, and a
# dict mapping each disease to its cleaned summary text.
clean_train_symptom = []
clean_train_summary = {}

print("Cleaning and parsing the training set symptoms...\n")
# Bug fix: the original used Python 2's `xrange`, which is a NameError in
# Python 3 (the print() call syntax indicates Python 3 is intended).
# Iterate the column directly instead of indexing by position.
for symptom in train["symptom"]:
    clean_train_symptom.append(
        " ".join(Word2VecUtility.symptoms_to_wordlist(symptom, True)))

print("Cleaning and parsing the training set summary...\n")
# Pair each disease with its summary row-by-row.
# NOTE(review): summaries are joined with "" (no separator), unlike the
# symptoms above which use " " — confirm this asymmetry is intentional.
for disease, summary in zip(train["disease"], train["summary"]):
    clean_train_summary[disease] = "".join(
        Word2VecUtility.summary_to_wordlist(summary))

# ****** Create a bag of words from the training set
print("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, scikit-learn's bag-of-words tool.
# All options are left at their defaults: built-in word tokenization, no
# extra preprocessing, no stop-word filtering, unlimited vocabulary size.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=None)