import os

def loadTexts():
    path = '../texts/math_learning_2letters_full/'
    texts = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
    # drop .DS_Store entries; building a new list avoids the bug of removing
    # items from a list while iterating over it (which skips elements)
    texts = [text for text in texts if '.DS_Store' not in text]
    delimiter = ' '
    return texts, getDocuments(texts, delimiter)
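# getDocuments is not defined in these scripts. The sketch below is only a guess at its
# shape, inferred from the call sites (a list of file paths plus a delimiter, with optional
# isCorpus/textNames arguments); its name and behaviour are assumptions, not the original helper.
def getDocuments_sketch(texts, delimiter, isCorpus=False, textNames=None):
    documents = {}
    names = textNames if textNames is not None else texts   # assumed to align one-to-one with texts
    for name, path in zip(names, texts):
        with open(path) as f:
            content = f.read()
        if delimiter == 'none':
            words = content.split()            # fall back to whitespace tokenisation
        else:
            words = content.split(delimiter)   # split on the requested delimiter
        documents[name] = [w for w in words if w]
    return documents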
def loadData(paths):
    # `groups` (["AD", "TD"]) is assumed to be defined at module level
    documents, texts = [{"AD": [], "TD": []} for _ in range(2)]
    for group in groups:
        for ds in range(len(paths['AD'])):
            currTexts = sorted([os.path.join(paths[group][ds], fn) for fn in os.listdir(paths[group][ds])])
            # drop .DS_Store entries without mutating the list while iterating over it
            currTexts = [text for text in currTexts if '.DS_Store' not in text]
            texts[group].append(currTexts)
        texts[group] = sum(texts[group], [])  # flatten the per-dataset lists
        documents[group] = getDocuments(texts[group], 'none', False, texts[group])
    return texts, documents
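# Example call, assuming `paths` maps each group to a list of dataset directories.
# The directory names below are placeholders, not paths from the original project:
# paths = {"AD": ['texts/AD_set1', 'texts/AD_set2'],
#          "TD": ['texts/TD_set1', 'texts/TD_set2']}
# texts, documents = loadData(paths)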
def runBag(path):
    textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
    # same listing as textNames, but with .DS_Store entries dropped
    scripts = [fn for fn in textNames if '.DS_Store' not in fn]
    documents = getDocuments(scripts, 'none', False, textNames)
    data, newVocab, featureNames = bagOfWords(scripts, documents, True, 0, False, False)
    return data, featureNames
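# Example call, reusing one of the directories that appears later in these scripts:
# data, featureNames = runBag('texts/ADHD_various_half/AD_4')
# `data` is the bag-of-words count output of bagOfWords (presumably one row per text)
# and `featureNames` gives the vocabulary entry for each column.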
from nltk.util import ngrams  # ngrams() below is assumed to be nltk's n-gram helper

AD_train = []
TD_train = []
AD_test = []
TD_test = []
for path in paths:
    textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
    texts = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
    # removing .DS_Store; building a new list avoids skipping items while iterating
    texts = [text for text in texts if '.DS_Store' not in text]
    docs = getDocuments(texts, 'none', False, textNames)
    doc_list = listsfromDict(docs)
    # replace each document's token list with its bigram representation
    bigram_docs = []
    for doc in doc_list:
        bigramList = ["_".join(item) for item in ngrams(doc, 2)]
        bigram_docs.append(bigramList)
    doc_list = bigram_docs
    if 'AD_' in path:
        if path in train_path:
            AD_train.append(doc_list)
        elif path in test_path:
            AD_test.append(doc_list)
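# Tiny illustration of the bigram encoding above (nltk's ngrams plus "_".join),
# using made-up tokens:
# ['a', 'b', 'c', 'd']  ->  ['a_b', 'b_c', 'c_d']
example_tokens = ['a', 'b', 'c', 'd']
example_bigrams = ["_".join(pair) for pair in ngrams(example_tokens, 2)]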
#trigrams
# ad_voc = loadCSV('NYU_AD_2letter_trigrams_BoW')
# td_voc = loadCSV('NYU_TD_2letter_trigrams_BoW')
# ad_voc = ad_voc[1]
# td_voc = td_voc[1]
# vocab = [w for w in td_voc if w in set(ad_voc)]
# print vocab

# `vocab` is assumed to be defined before this block (the commented lines above show the trigram variant)
vocab_dict = {"AD": dict((word, []) for word in vocab),
              "TD": dict((word, []) for word in vocab)}
for group in groups:
    texts[group] = sorted([os.path.join(paths[group], fn) for fn in os.listdir(paths[group])])
    if len(texts[group]) > 80:
        texts[group] = texts[group][1:]  # keep all but the first listed entry when the listing is unexpectedly long
    documents[group] = getDocuments(texts[group], 'none', False, texts[group])

    #### BAG OF WORDS ####
    for text in texts[group]:
        if '.DS_Store' not in text:
            doc_dict = {text: documents[group][text]}  # making single dict to get BoW code to work on individual text
            counts[group][text], reducedDocuments, words[group][text] = bagOfWords([text], doc_dict, True, 0, False, False)
            counts[group][text] = counts[group][text][0]  # flattening
            count_list = counts[group][text]
            word_list = words[group][text]  # for code simplification
            no_occurrence = [w for w in vocab if w not in set(word_list)]
            occurrence = [w for w in vocab if w in set(word_list)]
            for word in occurrence:
                count_ind = word_list.index(word)
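# Toy illustration (made-up lists) of the shared-vocabulary count lookup above:
# each vocabulary word's count in one text is read off with word_list.index(word),
# and words that never occur in that text can simply be assigned a count of 0.
example_vocab = ['ab_cd', 'cd_ef']
example_word_list = ['cd_ef', 'gh_ij']
example_count_list = [3, 1]
example_counts = [example_count_list[example_word_list.index(w)] if w in example_word_list else 0
                  for w in example_vocab]   # [0, 3]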
'''

#### DEFINE GROUPS AND GROUP VARIABLES ####
groups = ["AD", "TD"]
paths = {"AD": 'texts/ADHD_various_half/AD_4', "TD": 'texts/ADHD_various_half/TD_4'}
documents, texts = [{"AD": [], "TD": []} for _ in range(2)]
counts, words, DMN_inverse, tasknet_activation = [{"AD": {}, "TD": {}} for _ in range(4)]
networks = ["DMN", "SN", "LECN", "RECN"]

#### PROCESS DOCUMENTS ####
for group in groups:
    texts[group] = sorted([os.path.join(paths[group], fn) for fn in os.listdir(paths[group])])
    if len(texts[group]) > 80:
        texts[group] = texts[group][1:]
    documents[group] = getDocuments(texts[group], 'none', False, texts[group])

    #### BAG OF WORDS ####
    for text in texts[group]:
        DMN_inverse[group][text] = {"SN": 0, "LECN": 0, "RECN": 0}
        tasknet_activation[group][text] = {"SN_RECN": 0, "SN_LECN": 0, "LECN_RECN": 0, "SN_LECN_RECN": 0}
        doc_dict = {text: documents[group][text]}  # making single dict to get BoW code to work on individual text
        counts[group][text], reducedDocuments, words[group][text] = bagOfWords([text], doc_dict, False, 0, False, False)
        counts[group][text] = counts[group][text][0]  # flattening
        count_list = counts[group][text]
        word_list = words[group][text]  # for code simplification

        #### SORT BY ACTIVITY ####
        for word in word_list:
            # inverse activity with DMN
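# Note on the word encoding (inferred from the network_wise expansion elsewhere in these
# scripts and the `networks` list above): each word appears to be a 4-character activation
# code, one character per network, indexed in the order DMN, SN, LECN, RECN.
# The sample word below is made up for illustration.
sample_word = '0110'
sample_states = dict(zip(networks, sample_word))  # e.g. {'DMN': '0', 'SN': '1', 'LECN': '1', 'RECN': '0'}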
let_acc = [0] * 14  #14
feat_info = {}  #[[]] * 3
for let in range(3, 6):  #5 #16 (2,16)
    path = pre_path + '%s_word' % (let)
    textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
    scripts = textNames
    # keep everything except .DS_Store entries
    new_texts = [script for script in scripts if '.DS_Store' not in script]
    documents = getDocuments(new_texts, delimiter, isCorpus, textNames)
    data, newVocab, featureNames = bagOfWords(new_texts, documents, nGramsinCorpus, mincount, windowGrams, gramsOnly)
    rfACC = [0] * nModels
    importances = [[] for _ in range(nModels)]
    top_importances = [[] for _ in range(nModels)]
    top_stds = [[] for _ in range(nModels)]
    top_featNames = [[] for _ in range(nModels)]
    stds = [[] for _ in range(nModels)]
    for i in range(nModels):
        print "Running RF with %i estimators...\n" % (nEstimators)
        rfACC[i], importances[i], stds[i] = rfModel(data, labels, nFolds, nEstimators)
        idx = (-importances[i]).argsort()[:10]
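# rfModel is not defined in these scripts. The sketch below is a minimal guess at what it
# might do, assuming scikit-learn and the usage above: leave-one-out accuracy plus mean
# per-feature importances and their standard deviations across the forest's trees.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut

def rfModel_sketch(data, labels, nFolds, nEstimators):
    # nFolds equals len(labels) for leave-one-out, so LeaveOneOut() covers it implicitly
    data = np.asarray(data)
    labels = np.asarray(labels)
    correct = 0
    for train_idx, test_idx in LeaveOneOut().split(data):
        clf = RandomForestClassifier(n_estimators=nEstimators)
        clf.fit(data[train_idx], labels[train_idx])
        correct += int(clf.predict(data[test_idx])[0] == labels[test_idx][0])
    accuracy = float(correct) / len(labels)
    # importances and their tree-wise spread from a forest trained on all the data
    clf = RandomForestClassifier(n_estimators=nEstimators)
    clf.fit(data, labels)
    importances = clf.feature_importances_
    stds = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    return accuracy, importances, stds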
import numpy as np  # needed for np.asarray below

# for classification
nLabelOne = 40  #70#90#40#70#30  # number of TDs
nLabelTwo = 40  #70#90#40#70#30  # number of ADs
labels = np.asarray([0] * nLabelOne + [1] * nLabelTwo)
nFolds = len(labels)  # leave-one-out
nEstimators = 1000  #1000  # number of estimators for random forest classifier
runClassification = True   # run classification on topic probabilities
#runClassification = False  # OPTION ONLY FOR HDP ... run classification on document similarities

################################################
############## PROCESS DATA ####################
################################################

# Create dictionary with list of processed words for each document key
documents = getDocuments(scripts, delimiter, isCorpus, textNames)
#plot_Networkletters(documents, title, letters)

if network_wise:
    # expand each 4-character word into one token per network
    new_docs = {}
    for doc in documents:
        new_docs[doc] = []
        for word in documents[doc]:
            #documents[doc][documents[doc].index(word)] = 'DMN_' + word[0] + ' SN_' + word[1] + ' LECN_' + word[2] + ' RECN_' + word[3]
            new_docs[doc].append(' dmn_' + word[0])
            new_docs[doc].append(' sn_' + word[1])
            new_docs[doc].append(' lecn_' + word[2])
            new_docs[doc].append(' recn_' + word[3])
    documents = new_docs
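# Tiny illustration of the network_wise expansion above (the sample word is made up):
# '0110'  ->  [' dmn_0', ' sn_1', ' lecn_1', ' recn_0']
example_word = '0110'
example_tokens = [' dmn_' + example_word[0], ' sn_' + example_word[1],
                  ' lecn_' + example_word[2], ' recn_' + example_word[3]]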