예제 #1
0
            # Fragment: the enclosing function and `with` statement start above this excerpt.
            # Read every line of the already-open input file into memory.
            listDocOrg = fTxtOrg.readlines()
        print 'len(listDocOrg): ', len(listDocOrg)

        # Copy each input line verbatim to `outf` and collect its whitespace-split tokens.
        for rowOfListDocOrg in listDocOrg:
    # Disabled variant: keep only the text following a "Title: " / "Abstract: " prefix.
    #                print 'rowOfListDocOrg: ', rowOfListDocOrg
    #            myResult = p.search(rowOfListDocOrg)
    #            if myResult <> None:
    #                myData = re.sub('^Title: |^Abstract: ','',myResult.group())
    #                outf.write(myData)
            outf.write(rowOfListDocOrg)
            listMyWords.extend(rowOfListDocOrg.split())
    #                listDoc.append((myData.split(),fileOne[9:11]))
            # NOTE: the printed tuple is (name, tokens), the reverse of the order named in the label.
            print '(rowOfListDocOrg.split(),outputFileNameDiff): ', (outputFileNameDiff, rowOfListDocOrg.split())
            # Each document is stored as (token list, outputFileNameDiff).
            listDoc.append((rowOfListDocOrg.split(),outputFileNameDiff))
    # Running fold number, used in the output file names below.
    idxCrossValidation = 0
    # k_fold_cross_validation is defined outside this excerpt; each item it yields is
    # unpacked here as a (training rows, validation rows) pair.
    for listTrainWithDiff, listValidationWithDiff in k_fold_cross_validation(listDoc, numFold, randomize = True):
#        outputPercentageFilenameBase = typeTextPreprocess+str(idxCrossValidation)+'-Per'
        outputPercentageFilenameBase = typeTextPreprocess+str(idxCrossValidation)

        # Training rows of this fold: one line per row, tokens joined by single spaces.
        with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Train-'+'.txt', 'wb') as outf2:
            for oneRowOfListTrainWithDiff in listTrainWithDiff:
#                listAllDocWords.extend(oneRowOfListTrainWithDiff[0])
                outf2.write(' '.join(oneRowOfListTrainWithDiff[0])+'\n')

        # Validation rows of this fold, written in the same format to the "-Test-" file.
        with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Test-'+'.txt', 'wb') as outf3:
            for oneRowOflistValidationWithDiff in listValidationWithDiff:
                print 'oneRowOflistValidationWithDiff: ', oneRowOflistValidationWithDiff
                outf3.write(' '.join(oneRowOflistValidationWithDiff[0])+'\n')

        idxCrossValidation = idxCrossValidation + 1 
예제 #2
0
def _fResetDir(logLabel, dirPath):
    """Log ``dirPath``, delete it if it is an existing directory, then create it empty.

    Any error raised by ``shutil.rmtree`` or ``os.mkdir`` propagates to the caller.
    """
    logging.info("%s: %s", logLabel, dirPath)
    if os.path.isdir(dirPath):
        shutil.rmtree(dirPath)
    os.mkdir(dirPath)


def _fWriteRows(listRows, pathFoldFile, pathMergedFile, debugLabel=None):
    """Write the token list of every row as one space-joined line to two files.

    ``pathFoldFile`` is truncated/created, ``pathMergedFile`` is appended to.
    Each row is expected to carry its token list at index 0.  When
    ``debugLabel`` is given, every row is also logged at DEBUG level.
    """
    with open(pathMergedFile, 'a') as outfMerged:
        # NOTE: 'wb' combined with str writes relies on Python 2 semantics.
        with open(pathFoldFile, 'wb') as outfFold:
            for oneRow in listRows:
                if debugLabel is not None:
                    logging.debug('%s: %s', debugLabel, oneRow)
                lineText = ' '.join(oneRow[0]) + '\n'
                outfFold.write(lineText)
                outfMerged.write(lineText)


def fCreadeCrossValidationFiles(numFold):
    """Create per-fold training/testing CSV files for every usable input file.

    All paths live under ``~/Data/TestDir/``:

    * ``Output1/`` is scanned for input files.  WARNING: every regular file
      there whose size is not greater than ``numFold * 1500`` bytes is
      deleted from disk; only the larger files are processed.
    * ``Output2_TestingSet/``, ``Output2_Merge/`` and ``Output2_TrainingSet/``
      are wiped and recreated on every call.
    * Each input file is copied line by line to
      ``Output2_TestingSet/wnl-<name>.csv``; its lines are then split on
      whitespace and handed to ``k_fold_cross_validation``.  For fold ``i``
      the training rows go to ``Output2_TrainingSet/wnl-<i>-<name>-Train-.csv``
      and the validation rows to ``Output2_TestingSet/wnl-<i>-<name>-Test-.csv``.
      The same lines are also appended to the per-fold merged files
      ``Output2_Merge/wnl--<i>-Train-.csv`` / ``-Test-.csv``, which therefore
      accumulate the rows of all input files.

    Args:
        numFold: number of folds passed to ``k_fold_cross_validation``; it
            also scales the minimum accepted input file size.

    Returns:
        None.  The function works purely through filesystem side effects.

    Raises:
        Errors from ``os``/``shutil``/``open`` are not caught and propagate.
    """
    typeTextPreprocess = 'wnl-'

    dirMain = os.path.expanduser('~') + '/' + 'Data/TestDir/'
    logging.info("dirMain = os.path.expanduser('~')+'/': %s", dirMain)

    dirInput = 'Output1/'
    dirOutputTest = 'Output2_TestingSet/'
    logging.info("dirOutputTest: %s", dirOutputTest)
    dirOutputMergeFile = 'Output2_Merge/'
    dirOutputTrain = 'Output2_TrainingSet/'

    # Keep only inputs that are large enough; smaller files are removed from disk.
    ListInputFilenameTxt = []
    for itemName in os.listdir(dirMain + dirInput):
        itemPath = dirMain + dirInput + itemName
        if not os.path.isfile(itemPath):
            # Sub-directories (or other non-regular entries) can be neither
            # read as documents nor deleted with os.remove, so leave them alone.
            continue
        if os.stat(itemPath).st_size > numFold * 1500:
            ListInputFilenameTxt.append(itemName)
        else:
            os.remove(itemPath)

    logging.info("Unique Combinations of 2 letters from: %s", ', '.join(ListInputFilenameTxt))

    # Start every run from empty output directories.
    _fResetDir("dirMain + dirOutputTest", dirMain + dirOutputTest)
    _fResetDir("dirMain + dirOutputMergeFile", dirMain + dirOutputMergeFile)
    _fResetDir("dirMain + dirOutputTrain", dirMain + dirOutputTrain)

    for fileOne in ListInputFilenameTxt:
        # Drop the last four characters of the file name (e.g. a '.txt' suffix).
        outputFileNameDiff = fileOne[0:-4]
        logging.info('outputFileNameDiff: %s', outputFileNameDiff)

        listDoc = []

        pathCopy = dirMain + dirOutputTest + typeTextPreprocess + outputFileNameDiff + '.csv'
        logging.info(pathCopy)
        # NOTE: 'wb' combined with str writes relies on Python 2 semantics.
        with open(pathCopy, 'wb') as outf:
            with open(dirMain + dirInput + fileOne) as fTxtOrg:
                listDocOrg = fTxtOrg.readlines()
            logging.info('len(listDocOrg): %s', len(listDocOrg))

            for rowOfListDocOrg in listDocOrg:
                outf.write(rowOfListDocOrg)
                listTokens = rowOfListDocOrg.split()
                logging.debug('(rowOfListDocOrg.split(),outputFileNameDiff): %s - %s',
                              outputFileNameDiff, listTokens)
                # Each document is stored as (token list, source name).
                listDoc.append((listTokens, outputFileNameDiff))

        # The third positional argument is presumably the "randomise" flag of
        # k_fold_cross_validation(X, K, randomise) -- verify against its definition.
        listFolds = k_fold_cross_validation(listDoc, numFold, True)
        for idxCrossValidation, (listTrainWithDiff, listValidationWithDiff) in enumerate(listFolds):
            strFold = str(idxCrossValidation)
            outputPercentageFilenameBase = typeTextPreprocess + strFold
            # typeTextPreprocess already ends with '-', so merged names contain '--'.
            pathMergeBase = dirMain + dirOutputMergeFile + typeTextPreprocess + '-' + strFold

            _fWriteRows(
                listTrainWithDiff,
                dirMain + dirOutputTrain + outputPercentageFilenameBase + '-' + outputFileNameDiff + '-Train-' + '.csv',
                pathMergeBase + '-Train-' + '.csv')
            _fWriteRows(
                listValidationWithDiff,
                dirMain + dirOutputTest + outputPercentageFilenameBase + '-' + outputFileNameDiff + '-Test-' + '.csv',
                pathMergeBase + '-Test-' + '.csv',
                debugLabel='oneRowOflistValidationWithDiff')
예제 #3
0
# Fragment of a larger script: datos, options, test_percentage, datos1r, datos2r and
# crossValidation are defined outside this excerpt.
es_entrenamiento = numpy.asarray(es_entrenamiento)

# Append es_entrenamiento to datos as an extra column.
# assumes es_entrenamiento is 1-D with one entry per row of datos -- TODO confirm
archivo = numpy.concatenate((datos,es_entrenamiento[numpy.newaxis, :].T), axis=1)


# Check which type of CV was requested and run the corresponding CV.
# CV has to be run 3 times: once for 1R predictions, once for 2R predictions and once for 2R_1R.

# Run k-fold
if options.kf == True:
      
      validation_k = int(options.kfolds)

      
      # Apply CV to predict 1r
      respuestas, fold_error_t, fold_error_v, final_error_t, final_error_v = crossValidation.k_fold_cross_validation(validation_k, test_percentage, datos1r, options, "1r")
      # Values are printed as 100 - error; presumably the errors are percentages -- verify.
      print("Ronda 1")
      print("FoldAccuracyT", 100 - fold_error_t)
      print("FoldAccuracyV", 100 -fold_error_v)
      print("FinalAccuracyT",100 - final_error_t)
      print("FinalAccuracyV",100 - final_error_v)
      print("--------------------------------------")
      respuestas = numpy.asarray(respuestas)

      # Append the round-1 answers to archivo as another column.
      archivo = numpy.concatenate((archivo,respuestas[numpy.newaxis, :].T), axis=1)
      # Apply CV to predict 2r
      respuestas, fold_error_t, fold_error_v, final_error_t, final_error_v = crossValidation.k_fold_cross_validation(validation_k, test_percentage, datos2r, options, "2r")
      print("Ronda 2")
      print("FoldAccuracyT", 100 - fold_error_t)
      print("FoldAccuracyV", 100 - fold_error_v)
      print("FinalAccuracyT",100 - final_error_t)