Example #1
def loadTexts():
    path = '../texts/math_learning_2letters_full/'
    texts = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
    # Filter out macOS .DS_Store entries; removing from a list while iterating
    # over it can skip items, so build a new list instead
    texts = [text for text in texts if '.DS_Store' not in text]
    delimiter = ' '
    return texts, getDocuments(texts, delimiter)
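
# A minimal usage sketch of loadTexts (assumptions: the hard-coded corpus path
# exists, and getDocuments, defined elsewhere in this codebase, returns a dict
# mapping each text path to its list of processed words):
texts, documents = loadTexts()
print(len(texts), "texts found")        # number of scripts under the path
print(documents[texts[0]][:10])         # first few processed words of the first text
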
def loadData(paths):
    # 'groups' (e.g. ["AD", "TD"]) is expected to be defined at module level;
    # both groups are assumed to have the same number of dataset folders
    documents, texts = [{"AD": [], "TD": []} for _ in range(2)]
    for group in groups:
        for ds in range(len(paths['AD'])):
            currTexts = sorted([os.path.join(paths[group][ds], fn) for fn in os.listdir(paths[group][ds])])
            # Filter out macOS .DS_Store entries instead of removing while iterating
            currTexts = [text for text in currTexts if '.DS_Store' not in text]
            texts[group].append(currTexts)
        texts[group] = sum(texts[group], [])  # flatten the per-dataset lists
        documents[group] = getDocuments(texts[group], 'none', False, texts[group])

    return texts, documents
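
# A sketch of the expected input for loadData, assuming each group maps to a
# list of dataset folders; the folder names below are illustrative placeholders,
# not paths from this project:
groups = ["AD", "TD"]
example_paths = {"AD": ['texts/AD_set1', 'texts/AD_set2'],
                 "TD": ['texts/TD_set1', 'texts/TD_set2']}
texts, documents = loadData(example_paths)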
Example #3
def runBag(path):

    textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])

    # Same listing, with macOS .DS_Store entries filtered out; removing from a
    # list while iterating over it can skip entries
    scripts = [fn for fn in textNames if '.DS_Store' not in fn]

    documents = getDocuments(scripts, 'none', False, textNames)

    data, newVocab, featureNames = bagOfWords(scripts, documents, True, 0, False, False)

    return data, featureNames
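
# Usage sketch: judging from how the return values are used, bagOfWords yields a
# document-term count matrix, the reduced vocabulary, and the feature (term)
# names; runBag returns only the matrix and the names. The folder name below is
# an illustrative placeholder.
data, featureNames = runBag('texts/example_dataset')
print(len(data), "documents x", len(featureNames), "features")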
Example #4
def runBag(path):

    textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])

    # Same listing, with macOS .DS_Store entries filtered out; removing from a
    # list while iterating over it can skip entries
    scripts = [fn for fn in textNames if '.DS_Store' not in fn]

    documents = getDocuments(scripts, 'none', False, textNames)

    data, newVocab, featureNames = bagOfWords(scripts, documents, True, 0,
                                              False, False)

    return data, featureNames
def loadData(paths):
    # 'groups' (e.g. ["AD", "TD"]) is expected to be defined at module level;
    # both groups are assumed to have the same number of dataset folders
    documents, texts = [{"AD": [], "TD": []} for _ in range(2)]
    for group in groups:
        for ds in range(len(paths['AD'])):
            currTexts = sorted([
                os.path.join(paths[group][ds], fn)
                for fn in os.listdir(paths[group][ds])
            ])
            # Filter out macOS .DS_Store entries instead of removing while iterating
            currTexts = [text for text in currTexts if '.DS_Store' not in text]
            texts[group].append(currTexts)
        texts[group] = sum(texts[group], [])  # flatten the per-dataset lists
        documents[group] = getDocuments(texts[group], 'none', False,
                                        texts[group])

    return texts, documents
Example #6
    let_acc = [0] * 14  #14
    feat_info = {}  #[[]] * 3

    for let in range(3, 6):  #5 #16 (2,16)
        path = pre_path + '%s_word' % (let)

        textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
        scripts = textNames

        new_texts = []
        for script in scripts:
            if '.DS_Store' not in script:
                new_texts.append(script)

        documents = getDocuments(new_texts, delimiter, isCorpus, textNames)

        data, newVocab, featureNames = bagOfWords(new_texts, documents,
                                                  nGramsinCorpus, mincount,
                                                  windowGrams, gramsOnly)

        rfACC = [0] * nModels
        importances = [[]] * nModels
        top_importances = [[]] * nModels
        top_stds = [[]] * nModels
        top_featNames = [[]] * nModels
        stds = [[]] * nModels

        for i in range(nModels):
            print "Running RF with %i estimators...\n" % (nEstimators)
            rfACC[i], importances[i], stds[i] = rfModel(
Example #7
# for classification
nLabelOne = 40  #70#90#40#70#30 #number of TDs
nLabelTwo = 40  #70#90#40#70#30 #number of ADs
labels = np.asarray([0] * nLabelOne + [1] * nLabelTwo)
nFolds = len(labels)  #leave-one-out
nEstimators = 1000  #1000 #number of estimators for random forest classifier

runClassification = True  # run classification on topic probabilities
#runClassification = False # OPTION ONLY FOR HDP ...run classification on document similarities

################################################
############## PROCESS DATA ####################
################################################

# Create dictionary with list of processed words for each document key
documents = getDocuments(scripts, delimiter, isCorpus, textNames)
#plot_Networkletters(documents, title, letters)

if network_wise:

    new_docs = {}
    for doc in documents:
        new_docs[doc] = []
        for word in documents[doc]:
            #documents[doc][documents[doc].index(word)] = 'DMN_' + word[0] + ' SN_' + word[1] + ' LECN_' + word[2] + ' RECN_' + word[3]
            new_docs[doc].append(' dmn_' + word[0])
            new_docs[doc].append(' sn_' + word[1])
            new_docs[doc].append(' lecn_' + word[2])
            new_docs[doc].append(' recn_' + word[3])

    documents = new_docs
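
# A small illustration of the expansion above, assuming each processed "word" is
# a four-character state code with one character per network (DMN, SN, LECN,
# RECN), which is what the word[0]..word[3] indexing implies. The document name
# and state codes are made up:
example_docs = {'subject_01': ['hlhh', 'llhl']}
expanded = {}
for doc in example_docs:
    expanded[doc] = []
    for word in example_docs[doc]:
        expanded[doc].append(' dmn_' + word[0])
        expanded[doc].append(' sn_' + word[1])
        expanded[doc].append(' lecn_' + word[2])
        expanded[doc].append(' recn_' + word[3])
print(expanded['subject_01'])
# [' dmn_h', ' sn_l', ' lecn_h', ' recn_h', ' dmn_l', ' sn_l', ' lecn_h', ' recn_l']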
Example #8

AD_train = []
TD_train = []
AD_test = []
TD_test = []
for path in paths:

    textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])

    # Same listing, with macOS .DS_Store entries filtered out; removing from a
    # list while iterating over it can skip entries
    texts = [fn for fn in textNames if '.DS_Store' not in fn]

    docs = getDocuments(texts, 'none', False, textNames)
    doc_list = listsfromDict(docs)

    bigram_docs = []
    for doc in doc_list:
        bigramList = []
        for item in ngrams(doc, 2):
            bigramList.append("_".join(item))
        bigram_docs.append(bigramList)
    doc_list = bigram_docs

    if 'AD_' in path:
        if path in train_path:
            AD_train.append(doc_list)
        elif path in test_path:
            AD_test.append(doc_list)
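
# The bigram step above relies on the ngrams helper (presumably nltk's
# nltk.util.ngrams, which yields sliding tuples over a sequence); a
# self-contained sketch of that step with a toy token list:
from nltk import ngrams

toy_doc = ['aa', 'bb', 'cc', 'dd']
toy_bigrams = ["_".join(item) for item in ngrams(toy_doc, 2)]
print(toy_bigrams)   # ['aa_bb', 'bb_cc', 'cc_dd']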
Example #9
#trigrams
# ad_voc = loadCSV('NYU_AD_2letter_trigrams_BoW')
# td_voc = loadCSV('NYU_TD_2letter_trigrams_BoW')
# ad_voc = ad_voc[1]
# td_voc = td_voc[1]
# vocab = [w for w in td_voc if w in set(ad_voc)]
# print vocab

vocab_dict = {"AD": dict((word, []) for word in vocab),
              "TD": dict((word, []) for word in vocab)}

for group in groups:
    texts[group] = sorted([os.path.join(paths[group], fn) for fn in os.listdir(paths[group])])
    if len(texts[group]) > 80:
        texts[group] = texts[group][1:]  # drop the first (extra) directory entry

    documents[group] = getDocuments(texts[group], 'none', False, texts[group])

#### BAG OF WORDS ####
    for text in texts[group]:
        if '.DS_Store' not in text:
            doc_dict = {text:documents[group][text]} #making single dict to get BoW code to work on individual text
            counts[group][text], reducedDocuments, words[group][text] = bagOfWords([text], doc_dict, True, 0, False, False)
            counts[group][text] = counts[group][text][0] #flattening
            count_list = counts[group][text]
            word_list = words[group][text] # for code simplification

            no_occurrence = [w for w in vocab if w not in set(word_list)]
            occurrence = [w for w in vocab if w in set(word_list)]

            for word in occurrence:
                count_ind = word_list.index(word)
Example #10
    return newList

AD_train = []
TD_train = []
AD_test = []
TD_test = []
for path in paths:

    textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])

    # Same listing, with macOS .DS_Store entries filtered out; removing from a
    # list while iterating over it can skip entries
    texts = [fn for fn in textNames if '.DS_Store' not in fn]

    docs = getDocuments(texts,'none',False,textNames)
    doc_list = listsfromDict(docs)

    bigram_docs = []
    for doc in doc_list:
        bigramList = []
        for item in ngrams(doc,2):
            bigramList.append("_".join(item))
        bigram_docs.append(bigramList)
    doc_list = bigram_docs

    if 'AD_' in path:
        if path in train_path:
            AD_train.append(doc_list)
        elif path in test_path:
            AD_test.append(doc_list)
Example #11
'''

#### DEFINE GROUPS AND GROUP VARIABLES ####
groups = ["AD","TD"]
paths = {"AD":'texts/ADHD_various_half/AD_4',"TD":'texts/ADHD_various_half/TD_4'}
documents, texts = [{"AD":[],"TD":[]} for _ in range(2)]
counts, words, DMN_inverse, tasknet_activation = [{"AD":{},"TD":{}} for _ in range(4)]
networks = ["DMN","SN","LECN","RECN"]

#### PROCESS DOCUMENTS ####
for group in groups:
    texts[group] = sorted([os.path.join(paths[group], fn) for fn in os.listdir(paths[group])])
    if len(texts[group]) > 80:
        texts[group] = texts[group][1:]  # drop the first (extra) directory entry

    documents[group] = getDocuments(texts[group], 'none', False, texts[group])

#### BAG OF WORDS ####
    for text in texts[group]:
        DMN_inverse[group][text]= {"SN":0,"LECN":0,"RECN":0}
        tasknet_activation[group][text] = {"SN_RECN":0,"SN_LECN":0,"LECN_RECN":0,"SN_LECN_RECN":0}
        doc_dict = {text:documents[group][text]} #making single dict to get BoW code to work on individual text
        counts[group][text], reducedDocuments, words[group][text] = bagOfWords([text], doc_dict, False, 0, False, False)
        counts[group][text] = counts[group][text][0] #flattening
        count_list = counts[group][text]
        word_list = words[group][text] # for code simplification

        #### SORT BY ACTIVITY ####

        for word in word_list:
            # inverse activity with DMN
Example #12
    let_acc = [0] * 14#14
    feat_info = {}#[[]] * 3

    for let in range(3,6): #5 #16 (2,16)
        path = pre_path + '%s_word'%(let)

        textNames = sorted([os.path.join(path, fn) for fn in os.listdir(path)])
        scripts = textNames

        new_texts = []
        for script in scripts:
            if '.DS_Store' not in script:
                new_texts.append(script)

        documents = getDocuments(new_texts, delimiter, isCorpus, textNames)

        data, newVocab, featureNames = bagOfWords(new_texts, documents, nGramsinCorpus, mincount, windowGrams, gramsOnly)

        rfACC = [0] * nModels
        importances = [[]] * nModels
        top_importances = [[]] * nModels
        top_stds = [[]] * nModels
        top_featNames = [[]] * nModels
        stds = [[]] * nModels

        for i in range(nModels):
           print "Running RF with %i estimators...\n" %(nEstimators)
           rfACC[i], importances[i], stds[i] = rfModel(data, labels, nFolds, nEstimators)
           idx = (-importances[i]).argsort()[:10]
Example #13
# for classification
nLabelOne = 40#70#90#40#70#30 #number of TDs
nLabelTwo = 40#70#90#40#70#30 #number of ADs
labels  = np.asarray([0] * nLabelOne + [1] * nLabelTwo)
nFolds = len(labels) #leave-one-out
nEstimators = 1000 #1000 #number of estimators for random forest classifier

runClassification = True # run classification on topic probabilities
#runClassification = False # OPTION ONLY FOR HDP ...run classification on document similarities

################################################
############## PROCESS DATA ####################
################################################

# Create dictionary with list of processed words for each document key
documents = getDocuments(scripts, delimiter, isCorpus, textNames)
#plot_Networkletters(documents, title, letters)

if network_wise:

    new_docs = {}
    for doc in documents:
        new_docs[doc] = []
        for word in documents[doc]:
            #documents[doc][documents[doc].index(word)] = 'DMN_' + word[0] + ' SN_' + word[1] + ' LECN_' + word[2] + ' RECN_' + word[3]
            new_docs[doc].append(' dmn_' + word[0])
            new_docs[doc].append(' sn_' + word[1])
            new_docs[doc].append(' lecn_' + word[2])
            new_docs[doc].append(' recn_' + word[3])

    documents = new_docs