def test_source_headers_correct(self):
        """Check the headers read from the TSV and CSV test files.

        For each of the four framework files, the header returned by
        read_header() must have the expected number of fields and must be
        equal to the model header for that file.
        """
        print('testing source_headers_correct')
        framework = self.framework
        tsvfile1 = framework.tsvtest1
        tsvfile2 = framework.tsvtest2
        csvfile1 = framework.csvtest1
        csvfile2 = framework.csvtest2

        # Model for the "1" files: four named fields plus an empty fifth.
        fivefields = [
            'materialSampleID', 'principalInvestigator', 'locality',
            'phylum', '',
        ]
        # Model for the "2" files: six named fields.
        sixfields = [
            'materialSampleID', 'principalInvestigator', 'locality',
            'phylum', 'decimalLatitude', 'decimalLongitude',
        ]

        # (file, expected field count, model header), in checking order.
        cases = (
            (tsvfile1, 5, fivefields),
            (tsvfile2, 6, sixfields),
            (csvfile2, 6, sixfields),
            (csvfile1, 5, fivefields),
        )
        for sourcefile, fieldcount, model in cases:
            found = read_header(sourcefile)
            self.assertEqual(len(found), fieldcount,
                             'incorrect number of fields in header')
            self.assertEqual(found, model,
                             'header not equal to the model header')
Пример #2
0
    def test_vocab_appender(self):
        """Check that vocab_appender() reports new values once only.

        The first call with two geography values must report both of them
        under 'addedvalues'. A second call with the same inputs must report
        None under 'addedvalues'. Finally the first field of the vocabulary
        file header must equal the composed geography key.
        """
        print('testing vocab_appender')
        testvocabfile = self.framework.testvocabfile

        geogkey = compose_key_from_list(geogkeytermlist)

        g1 = 'Oceania|United States|US|Hawaii|Honolulu|Honolulu|'
        g1 += 'North Pacific Ocean|Hawaiian Islands|Oahu'
        g2 = '|United States||WA|Chelan Co.||||'

        n = [g1, g2]

        inputs = {}
        inputs['vocabfile'] = testvocabfile
        inputs['key'] = geogkey
        inputs['checkvaluelist'] = n

        # Add new vocab to new vocab file
        response = vocab_appender(inputs)

        writtenlist = response['addedvalues']
        self.assertEqual(writtenlist, n,
                         'values not written to new testvocabfile')

        # Attempt to add same vocabs to the same vocabs file
        response = vocab_appender(inputs)

        writtenlist = response['addedvalues']
        self.assertIsNone(writtenlist,
                          'duplicate value written to testvocabfile')

        header = read_header(testvocabfile)
        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(header[0], geogkey,
                         'key field not correct in testvocabfile')
Пример #3
0
    def test_term_exists(self):
        """Check that specific field names occur in the test file headers.

        The names are compared verbatim against the header returned by
        read_header(), so 'fieldNumber ' is looked up with its trailing
        space.
        """
        print('testing term_exists')
        template = 'test file %s does not contain "%s" field'

        path = self.framework.testfile1
        fields = read_header(path)
        for name in ('year', 'fieldNumber '):
            self.assertTrue(name in fields, template % (path, name))

        path = self.framework.testfile2
        fields = read_header(path)
        name = 'month'
        self.assertTrue(name in fields, template % (path, name))
    def test_text_file_filter(self):
        """Check the output of text_file_filter() for year == '1990'.

        The call must report success and produce an output file. Reading
        that file back, 5 rows must carry the matching value in the 'year'
        field, and count_rows() must report 7 for the file.
        """
        print('testing text_file_filter')
        testinputfile = self.framework.testinputfile
        testreportfile = self.framework.testreportfile
        workspace = self.framework.testdatapath
        termname = 'year'
        matchingvalue = '1990'

        inputs = {}
        inputs['inputfile'] = testinputfile
        inputs['termname'] = termname
        inputs['matchingvalue'] = matchingvalue
        inputs['workspace'] = workspace
        inputs['outputfile'] = testreportfile

        # Create the report
        response = text_file_filter(inputs)
        success = response['success']
        s = 'text file filter failed: %s' % response['message']
        self.assertTrue(success, s)

        # The location of the output is taken from the response rather than
        # being composed from workspace and file name here.
        outputfile = response['outputfile']
        s = 'Output file %s not created' % outputfile
        self.assertTrue(os.path.isfile(outputfile), s)

        header = read_header(outputfile)
        dialect = csv_file_dialect(outputfile)
        encoding = csv_file_encoding(outputfile)

        # Count the rows of the output file that carry the matching value.
        matches = sum(1 for row in read_csv_row(outputfile,
                                                dialect=dialect,
                                                encoding=encoding,
                                                header=True,
                                                fieldnames=header)
                      if row[termname] == matchingvalue)
        expected = 5
        s = 'Number of matches in output (%s) not as expected (%s)' % (
            matches, expected)
        self.assertEqual(matches, expected, s)

        # This second check is about the row count reported by count_rows(),
        # so the failure message talks about rows, not matches.
        rows = count_rows(outputfile)
        expected = 7
        s = 'Number of rows in %s ' % outputfile
        s += 'was %s, not as expected (%s) ' % (rows, expected)
        self.assertEqual(rows, expected, s)
 def test_writevocabheader(self):
     """Write a vocabulary header with writevocabheader() and read it back.

     The call must return a true value, and read_header() on the same file
     must return the same list of field names that was written.
     """
     print('testing writevocabheader')
     target = self.framework.writevocabheadertestfile
     columns = (
         'country|stateprovince|county', 'standard', 'vetted', 'error',
         'misplaced', 'unresolved', 'source', 'comment',
     )

     written = writevocabheader(target, list(columns), vocab_dialect())
     self.assertTrue(written, 'vocab header not written')

     found = read_header(target)
     # A separate list from the one handed to writevocabheader().
     expected = list(columns)
     s = 'header:\n%s\nfrom file: %s\nnot as expected:\n%s' % (
         found, target, expected)
     self.assertEqual(found, expected, s)
    def test_read_vocab_header(self):
        """Check the header of the month vocabulary file.

        Read with the vocabulary dialect and utf-8 encoding, the header must
        have three fields and be equal to ['month'] + vocabfieldlist.
        """
        print('testing read_vocab_header')
        dialect = vocab_dialect()
        encoding = 'utf-8'

        monthvocabfile = self.framework.monthvocabfile
        header = read_header(monthvocabfile, dialect, encoding)

        fieldcount = len(header)
        wantedcount = 3
        self.assertEqual(
            fieldcount, wantedcount,
            'Found %s fields in header. Expected %s' % (fieldcount,
                                                        wantedcount))

        model = ['month'] + vocabfieldlist
        parts = [
            'File: %s\nheader: %s\n' % (monthvocabfile, header),
            'not as expected: %s' % model,
        ]
        self.assertEqual(header, model, ''.join(parts))
 def test_vocab_headers_correct(self):
     """Check the header of the vocabulary file of every controlled term.

     For each term in controlledtermlist the file '<vocabpath><term>.txt'
     is read and its header must equal [term.lower()] + vocabfieldlist.
     """
     print('testing vocab_headers_correct')
     vocabpath = self.framework.vocabpath
     dialect = vocab_dialect()
     encoding = 'utf-8'

     for term in controlledtermlist:
         path = ''.join([vocabpath, term, '.txt'])
         if not os.path.isfile(path):
             # A missing file gets a header written before it is read back.
             # NOTE(review): the return value of writevocabheader() is not
             # checked here.
             writevocabheader(path, vocabfieldlist, dialect, encoding)
         found = read_header(path, dialect, encoding)
         model = [term.lower()] + vocabfieldlist
         parts = [
             'File: %s\nheader: %s\n' % (path, found),
             'not as expected: %s' % model,
         ]
         self.assertEqual(found, model, ''.join(parts))
    def test_term_standardizer_report(self):
        """Check the report written by term_standardizer_report() for 'month'.

        The call must return a true value. The report header must be
        ['ID', 'month', 'country', 'month_orig'], and in the first row
        'month_orig' must be 'vi' and 'month' must be '6'.
        """
        print('testing term_standardizer_report')
        testcorrectioninputfile = self.framework.testcorrectioninputfile
        testcorrectionreportfile = self.framework.testcorrectionreportfile
        testmonthvocabfile = self.framework.testmonthvocabfile

        key = 'month'
        result = term_standardizer_report(testcorrectioninputfile,
                                          testcorrectionreportfile,
                                          testmonthvocabfile,
                                          key)
        # Each fragment ends with a space so the file names do not run into
        # the following label in the assembled failure message.
        s = 'term_standardizer_report() result not True '
        s += 'with inputfile: %s ' % testcorrectioninputfile
        s += 'outputfile: %s ' % testcorrectionreportfile
        s += 'and vocabfile: %s' % testmonthvocabfile
        self.assertTrue(result, s)

        outputheader = read_header(testcorrectionreportfile)
        expected = ['ID', 'month', 'country', 'month_orig']
        s = 'outputheader: %s not as expected: %s' % (outputheader, expected)
        self.assertEqual(outputheader, expected, s)

        dialect = csv_file_dialect(testcorrectionreportfile)
        encoding = csv_file_encoding(testcorrectionreportfile)
        rows = read_rows(testcorrectionreportfile,
                         1,
                         dialect=dialect,
                         encoding=encoding,
                         header=True,
                         fieldnames=outputheader)
        firstrow = rows[0]

        field = 'month_orig'
        value = firstrow[field]
        expected = 'vi'
        s = 'Field %s value %s not as expected (%s)' % (field, value, expected)
        self.assertEqual(value, expected, s)

        field = 'month'
        value = firstrow[field]
        expected = '6'
        s = 'Field %s value %s not as expected (%s)' % (field, value, expected)
        self.assertEqual(value, expected, s)
Пример #9
0
    def test_text_file_field_stripper(self):
        """Check the output of text_file_field_stripper() for two terms.

        With termlist 'country|stateProvince' and separator '|', the call
        must report success and produce an output file for which
        count_rows() gives 10 and whose header is
        ['country', 'stateprovince'].
        """
        print('testing text_file_field_stripper')
        testinputfile = self.framework.testinputfile
        testreportfile = self.framework.testreportfile
        workspace = self.framework.testdatapath
        termlist = 'country|stateProvince'

        inputs = {}
        inputs['inputfile'] = testinputfile
        inputs['termlist'] = termlist
        inputs['workspace'] = workspace
        inputs['outputfile'] = testreportfile
        inputs['separator'] = '|'

        # Create the report
        response = text_file_field_stripper(inputs)
        success = response['success']
        # Name the function under test in the failure message.
        s = 'text file field stripper failed: %s' % response['message']
        self.assertTrue(success, s)

        # The location of the output is taken from the response rather than
        # being composed from workspace and file name here.
        outputfile = response['outputfile']
        s = 'Output file %s not created' % outputfile
        self.assertTrue(os.path.isfile(outputfile), s)

        header = read_header(outputfile)

        rows = count_rows(outputfile)
        expected = 10
        s = 'Number of rows in %s ' % outputfile
        s += 'was %s, not as expected (%s) ' % (rows, expected)
        self.assertEqual(rows, expected, s)

        expected = ['country', 'stateprovince']
        s = 'Header: %s, not as expected: %s' % (header, expected)
        self.assertEqual(header, expected, s)
    def test_aggregate_tsvs(self):
        """Check the file produced by text_file_aggregator() for TSV input.

        The output file named in the response must exist, the response must
        report an aggregate row count of 6, and the output header must be
        the six expected field names in the order listed below.
        """
        print('testing aggregate_tsvs')
        tsvfile = self.framework.tsvfile
        tsvcompositepath = self.framework.tsvcompositepath
        workspace = self.framework.testdatapath

        # Aggregate text file
        response = text_file_aggregator({
            'inputpath': tsvcompositepath,
            'outputfile': tsvfile,
            'inputdialect': 'tsv',
            'workspace': workspace,
        })

        outputfile = response['outputfile']
        self.assertTrue(os.path.isfile(outputfile),
                        outputfile + ' does not exist')
        self.assertEqual(response['aggregaterowcount'], 6,
                         'incorrect number of rows')

        found = read_header(outputfile)
        model = [
            'decimalLatitude',
            'decimalLongitude',
            'locality',
            'materialSampleID',
            'phylum',
            'principalInvestigator',
        ]
        self.assertEqual(len(found), 6,
                         'incorrect number of fields in header')
        self.assertEqual(found, model,
                         'header not equal to the model header')
Пример #11
0
    def test_term_completeness_reporter(self):
        """Check the report written by term_completeness_reporter().

        The call must report success and produce an output file for which
        count_rows() gives 24 and whose header is ['field', 'count'].
        """
        print('testing term_completeness_reporter')
        testinputfile = self.framework.testinputfile
        testreportfile = self.framework.testreportfile
        workspace = self.framework.testdatapath

        inputs = {}
        inputs['inputfile'] = testinputfile
        inputs['workspace'] = workspace
        inputs['outputfile'] = testreportfile

        # Create the report
        response = term_completeness_reporter(inputs)
        success = response['success']
        s = 'Term completeness counter failed: %s' % response['message']
        self.assertTrue(success, s)

        # The location of the output is taken from the response rather than
        # being composed from workspace and file name here.
        outputfile = response['outputfile']
        s = 'Output file %s not created' % outputfile
        self.assertTrue(os.path.isfile(outputfile), s)

        header = read_header(outputfile)

        rows = count_rows(outputfile)
        expected = 24
        s = 'Number of rows in %s ' % outputfile
        s += 'was %s, not as expected (%s) ' % (rows, expected)
        self.assertEqual(rows, expected, s)

        expected = ['field', 'count']
        s = 'Header: %s, not as expected: %s' % (header, expected)
        self.assertEqual(header, expected, s)
    def test_source_headers_correct(self):
        """Check the header of the file written by dwca_core_to_tsv().

        The output file named in the response is read with the TSV dialect;
        its header must have 85 fields and equal the model header below.
        """
        print('testing source_headers_correct')
        dwca = self.framework.dwca
        workspace = self.framework.testdatapath
        outputfile = self.framework.outputfile
        archivetype = self.framework.archivetype

        response = dwca_core_to_tsv({
            'inputfile': dwca,
            'outputfile': outputfile,
            'workspace': workspace,
            'archivetype': archivetype,
        })

        found = read_header(response['outputfile'], tsv_dialect())
        model = [
            'type',
            'modified',
            'language',
            'accessRights',
            'references',
            'institutionCode',
            'collectionCode',
            'basisOfRecord',
            'informationWithheld',
            'dynamicProperties',
            'occurrenceID',
            'catalogNumber',
            'recordNumber',
            'recordedBy',
            'individualCount',
            'sex',
            'lifeStage',
            'establishmentMeans',
            'preparations',
            'associatedMedia',
            'associatedSequences',
            'associatedTaxa',
            'otherCatalogNumbers',
            'occurrenceRemarks',
            'associatedOccurrences',
            'previousIdentifications',
            'fieldNumber',
            'eventDate',
            'eventTime',
            'endDayOfYear',
            'year',
            'month',
            'day',
            'verbatimEventDate',
            'habitat',
            'samplingProtocol',
            'eventRemarks',
            'higherGeography',
            'continent',
            'waterBody',
            'islandGroup',
            'island',
            'country',
            'stateProvince',
            'county',
            'locality',
            'verbatimLocality',
            'minimumElevationInMeters',
            'maximumElevationInMeters',
            'minimumDepthInMeters',
            'maximumDepthInMeters',
            'locationAccordingTo',
            'locationRemarks',
            'decimalLatitude',
            'decimalLongitude',
            'geodeticDatum',
            'coordinateUncertaintyInMeters',
            'verbatimCoordinates',
            'verbatimCoordinateSystem',
            'georeferencedBy',
            'georeferencedDate',
            'georeferenceProtocol',
            'georeferenceSources',
            'georeferenceVerificationStatus',
            'identificationQualifier',
            'typeStatus',
            'identifiedBy',
            'dateIdentified',
            'identificationReferences',
            'identificationVerificationStatus',
            'identificationRemarks',
            'scientificName',
            'higherClassification',
            'kingdom',
            'phylum',
            'class',
            'order',
            'family',
            'genus',
            'specificEpithet',
            'infraspecificEpithet',
            'taxonRank',
            'nomenclaturalCode',
            'individualID',
            'rights',
        ]

        self.assertEqual(len(found), 85,
                         'incorrect number of fields in header')
        s = 'Header:\n%s\nnot equal to the model header:\n%s' % (found, model)
        self.assertEqual(found, model, s)
    def test_term_setter_report(self):
        """Check term_setter_report() for adding and replacing fields.

        Three scenarios are run against the same input and report files:
        adding one field, adding a '|'-separated list of fields, and setting
        a field that the expected header already contains. Each scenario
        checks the return value, the report header and the value(s) found in
        the first report row.
        """
        print('testing term_setter_report')
        testsetterinputfile = self.framework.testsetterinputfile
        testsetterreportfile = self.framework.testsetterreportfile

        def run_setter(key, values):
            # Run the function under test and require a true result.
            outcome = term_setter_report(testsetterinputfile,
                                         testsetterreportfile,
                                         key,
                                         constantvalues=values)
            note = ''.join([
                'term_setter_report() result not True ',
                'with inputfile: %s ' % testsetterinputfile,
                'and outputfile: %s' % testsetterreportfile,
            ])
            self.assertTrue(outcome, note)

        def checked_header(model):
            # Read the report header, compare it to the model, return it.
            found = read_header(testsetterreportfile)
            note = 'outputheader: %s not as expected: %s' % (found, model)
            self.assertEqual(found, model, note)
            return found

        def first_row(fieldnames):
            # Read one row of the report using the header that was found.
            rows = read_rows(testsetterreportfile,
                             1,
                             dialect=csv_file_dialect(testsetterreportfile),
                             encoding=csv_file_encoding(testsetterreportfile),
                             header=True,
                             fieldnames=fieldnames)
            return rows[0]

        def check_value(row, field, wanted):
            # Compare a single field of the row with the wanted value.
            value = row[field]
            note = 'Field %s value %s not as expected (%s)' % (field, value,
                                                               wanted)
            self.assertEqual(value, wanted, note)

        # Test field addition
        run_setter('institutionCode', 'CAS')
        header = checked_header(['ID', 'month', 'country', 'institutionCode'])
        row = first_row(header)
        check_value(row, 'institutionCode', 'CAS')

        # Test field list addition
        run_setter('institutionCode|license', 'CAS|CC0')
        header = checked_header(
            ['ID', 'month', 'country', 'institutionCode', 'license'])
        row = first_row(header)
        check_value(row, 'institutionCode', 'CAS')
        check_value(row, 'license', 'CC0')

        # Test field replacement
        run_setter('country', 'Argentina')
        header = checked_header(['ID', 'month', 'country'])
        row = first_row(header)
        check_value(row, 'country', 'Argentina')
Пример #14
0
    def test_darwinize_header(self):
        """Check darwinize_header() output for three input files.

        For each of testfile1, testfile2 and testfile3 the header of the
        output file must equal the expected list, and terms_not_in_dwc()
        applied case-sensitively to that header must return the expected
        list of terms.
        """
        print('testing darwinize_header')
        testfile1 = self.framework.testfile1
        testfile2 = self.framework.testfile2
        testfile3 = self.framework.testfile3
        testdatapath = self.framework.testdatapath
        dwccloudfile = self.framework.dwccloudfile
        outputfile = self.framework.outputfile

        inputs = {}
        inputs['inputfile'] = testfile1
        inputs['dwccloudfile'] = dwccloudfile
        inputs['outputfile'] = outputfile
        inputs['workspace'] = testdatapath

        # Darwinize the header
        darwinize_header(inputs)
        # The same output location is read back after every run below.
        outfilelocation = '%s/%s' % (testdatapath, outputfile)
        header = read_header(outfilelocation)
        expected = [
            'catalogNumber', 'recordedBy', 'fieldNumber', 'year', 'month',
            'day', 'decimalLatitude', 'decimalLongitude', 'geodeticDatum',
            'country', 'stateProvince', 'county', 'locality', 'family',
            'scientificName', 'scientificNameAuthorship',
            'reproductiveCondition', 'institutionCode', 'collectionCode',
            'datasetName', 'Id'
        ]
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (testfile1, header,
                                                           expected)
        self.assertEqual(header, expected, s)

        # What is not Darwin Core?
        casesensitive = True
        notdwc = terms_not_in_dwc(header, casesensitive)
        expected = ['Id']
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (testfile1, notdwc,
                                                           expected)
        self.assertEqual(notdwc, expected, s)

        inputs['inputfile'] = testfile2

        # Darwinize the header
        darwinize_header(inputs)
        header = read_header(outfilelocation)
        expected = [
            'materialSampleID', 'principalInvestigator', 'locality', 'phylum',
            'decimalLatitude', 'decimalLongitude',
            'coordinateUncertaintyInMeters', 'georeferenceProtocol', 'year',
            'month', 'day', 'genus', 'specificEpithet', 'permitInformation',
            'basisOfIdentification', 'taxonID', 'country', 'stateProvince',
            'island', 'islandGroup', 'sampleOwnerInstitutionCode',
            'fundingSource', 'occurrenceID', 'associatedMedia',
            'associatedReferences', 'preservative', 'previousIdentifications',
            'lifeStage', 'weight', 'length', 'sex', 'establishmentMeans',
            'associatedSequences', 'occurrenceRemarks', 'habitat',
            'microHabitat', 'substratum', 'samplingProtocol',
            'minimumDepthInMeters', 'maximumDepthInMeters',
            'minimumDistanceAboveSurfaceInMeters',
            'maximumDistanceAboveSurfaceInMeters', 'associatedTaxa',
            'fieldNotes', 'eventRemarks', 'recordedBy', 'identifiedBy',
            'yearIdentified', 'monthIdentified', 'dayIdentified', 'class',
            'order', 'family', 'infraspecificEpithet', 'vernacularName',
            'taxonRemarks', 'geneticTissueType', 'plateID', 'wellID',
            'extractionID', 'otherCatalogNumbers', 'tissueStorageID', 'BCID',
            'UNNAMED_COLUMN_1'
        ]
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (testfile2, header,
                                                           expected)
        self.assertEqual(header, expected, s)

        # What is not Darwin Core?
        casesensitive = True
        notdwc = terms_not_in_dwc(header, casesensitive)
        expected = [
            'BCID', 'UNNAMED_COLUMN_1', 'basisOfIdentification',
            'dayIdentified', 'extractionID', 'fundingSource',
            'geneticTissueType', 'length', 'microHabitat', 'monthIdentified',
            'permitInformation', 'plateID', 'preservative',
            'principalInvestigator', 'sampleOwnerInstitutionCode',
            'substratum', 'tissueStorageID', 'weight', 'wellID',
            'yearIdentified'
        ]
        # Report the file that was actually darwinized in this step.
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (testfile2, notdwc,
                                                           expected)
        self.assertEqual(notdwc, expected, s)

        inputs['inputfile'] = testfile3

        # Darwinize the header
        darwinize_header(inputs)
        header = read_header(outfilelocation)
        expected = [
            'id', 'institutionCode', 'collectionCode', 'basisOfRecord',
            'occurrenceID', 'catalogNumber', 'otherCatalogNumbers', 'kingdom',
            'phylum', 'class', 'order', 'family', 'scientificName',
            'scientificNameAuthorship', 'genus', 'specificEpithet',
            'taxonRank', 'infraspecificEpithet', 'identifiedBy',
            'dateIdentified', 'identificationReferences',
            'identificationRemarks', 'taxonRemarks', 'identificationQualifier',
            'typeStatus', 'recordedBy', 'recordedByID', 'associatedCollectors',
            'recordNumber', 'eventDate', 'year', 'month', 'day',
            'startDayOfYear', 'endDayOfYear', 'verbatimEventDate',
            'occurrenceRemarks', 'habitat', 'substrate', 'verbatimAttributes',
            'fieldNumber', 'informationWithheld', 'dataGeneralizations',
            'dynamicProperties', 'associatedTaxa', 'reproductiveCondition',
            'establishmentMeans', 'cultivationStatus', 'lifeStage', 'sex',
            'individualCount', 'samplingProtocol', 'samplingEffort',
            'preparations', 'country', 'stateProvince', 'county',
            'municipality', 'locality', 'locationRemarks', 'localitySecurity',
            'localitySecurityReason', 'decimalLatitude', 'decimalLongitude',
            'geodeticDatum', 'coordinateUncertaintyInMeters',
            'verbatimCoordinates', 'georeferencedBy', 'georeferenceProtocol',
            'georeferenceSources', 'georeferenceVerificationStatus',
            'georeferenceRemarks', 'minimumElevationInMeters',
            'maximumElevationInMeters', 'minimumDepthInMeters',
            'maximumDepthInMeters', 'verbatimDepth', 'verbatimElevation',
            'disposition', 'language', 'recordEnteredBy', 'modified',
            'sourcePrimaryKey', 'collId', 'recordId', 'references'
        ]
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (testfile3, header,
                                                           expected)
        # Lift unittest's diff size limit for the long list comparison.
        self.maxDiff = None
        self.assertEqual(header, expected, s)

        # What is not Darwin Core?
        casesensitive = True
        notdwc = terms_not_in_dwc(header, casesensitive)
        expected = [
            'associatedCollectors', 'collId', 'cultivationStatus', 'id',
            'localitySecurity', 'localitySecurityReason', 'recordEnteredBy',
            'recordId', 'recordedByID', 'sourcePrimaryKey', 'substrate',
            'verbatimAttributes'
        ]
        # Report the file that was actually darwinized in this step.
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (testfile3, notdwc,
                                                           expected)
        self.assertEqual(notdwc, expected, s)
# Example #15
# 0
    def test_headers(self):
        # Checks that read_header() returns exactly the expected list of
        # field names, in order, for framework test files 1, 2 and 3.
        # Each comparison passes a message naming the file and showing both
        # the header found and the header expected.
        print('testing headers')
        template = 'test file %s header:\n%s does not match expected:\n%s'

        # File 1: several expected names carry a trailing space, so the
        # comparison is against the names exactly as listed here.
        testfile = self.framework.testfile1
        header = read_header(testfile)
        expected = [
            'catalogNumber ', 'recordedBy', 'fieldNumber ', 'year', 'month',
            'day', 'decimalLatitude ', 'decimalLongitude ', 'geodeticDatum ',
            'country', 'stateProvince', 'county', 'locality', 'family ',
            'scientificName ', 'scientificNameAuthorship ',
            'reproductiveCondition ', 'InstitutionCode ', 'CollectionCode ',
            'DatasetName ', 'Id'
        ]
        s = template % (testfile, header, expected)
        self.assertEqual(header, expected, s)

        # File 2: the expected header ends with an empty field name.
        testfile = self.framework.testfile2
        header = read_header(testfile)
        expected = [
            'materialSampleID', 'principalInvestigator', 'locality', 'phylum',
            'decimalLatitude', 'decimalLongitude',
            'coordinateUncertaintyInMeters', 'georeferenceProtocol',
            'yearCollected', 'monthCollected', 'dayCollected', 'genus',
            'species', 'permitInformation', 'basisOfIdentification', 'wormsID',
            'country', 'stateProvince', 'island', 'islandGroup',
            'sampleOwnerInstitutionCode', 'fundingSource', 'occurrenceID',
            'associatedMedia', 'associatedReferences', 'preservative',
            'previousIdentifications', 'lifeStage', 'weight', 'length', 'sex',
            'establishmentMeans', 'associatedSequences', 'occurrenceRemarks',
            'habitat', 'microHabitat', 'substratum', 'samplingProtocol',
            'minimumDepthInMeters', 'maximumDepthInMeters',
            'minimumDistanceAboveSurfaceInMeters',
            'maximumDistanceAboveSurfaceInMeters', 'associatedTaxa',
            'fieldNotes', 'eventRemarks', 'recordedBy', 'identifiedBy',
            'yearIdentified', 'monthIdentified', 'dayIdentified', 'class',
            'order', 'family', 'subSpecies', 'vernacularName', 'taxonRemarks',
            'geneticTissueType', 'plateID', 'wellID', 'extractionID',
            'previousTissueID', 'tissueStorageID', 'BCID', ''
        ]
        s = template % (testfile, header, expected)
        self.assertEqual(header, expected, s)

        # File 3.
        testfile = self.framework.testfile3
        header = read_header(testfile)
        expected = [
            'id', 'institutionCode', 'collectionCode', 'basisOfRecord',
            'occurrenceID', 'catalogNumber', 'otherCatalogNumbers', 'kingdom',
            'phylum', 'class', 'order', 'family', 'scientificName',
            'scientificNameAuthorship', 'genus', 'specificEpithet',
            'taxonRank', 'infraspecificEpithet', 'identifiedBy',
            'dateIdentified', 'identificationReferences',
            'identificationRemarks', 'taxonRemarks', 'identificationQualifier',
            'typeStatus', 'recordedBy', 'recordedByID', 'associatedCollectors',
            'recordNumber', 'eventDate', 'year', 'month', 'day',
            'startDayOfYear', 'endDayOfYear', 'verbatimEventDate',
            'occurrenceRemarks', 'habitat', 'substrate', 'verbatimAttributes',
            'fieldNumber', 'informationWithheld', 'dataGeneralizations',
            'dynamicProperties', 'associatedTaxa', 'reproductiveCondition',
            'establishmentMeans', 'cultivationStatus', 'lifeStage', 'sex',
            'individualCount', 'samplingProtocol', 'samplingEffort',
            'preparations', 'country', 'stateProvince', 'county',
            'municipality', 'locality', 'locationRemarks', 'localitySecurity',
            'localitySecurityReason', 'decimalLatitude', 'decimalLongitude',
            'geodeticDatum', 'coordinateUncertaintyInMeters',
            'verbatimCoordinates', 'georeferencedBy', 'georeferenceProtocol',
            'georeferenceSources', 'georeferenceVerificationStatus',
            'georeferenceRemarks', 'minimumElevationInMeters',
            'maximumElevationInMeters', 'minimumDepthInMeters',
            'maximumDepthInMeters', 'verbatimDepth', 'verbatimElevation',
            'disposition', 'language', 'recordEnteredBy', 'modified',
            'sourcePrimaryKey', 'collId', 'recordId', 'references'
        ]
        s = template % (testfile, header, expected)
        self.assertEqual(header, expected, s)

    def test_aggregate_mix(self):
        # Runs text_file_aggregator() over the framework's mixed composite
        # path and checks three things about the response: the output file
        # exists, the reported aggregate row count is 19, and the header
        # read back from the output file equals the 77-name model below.
        print('testing aggregate_mix')
        target_file = self.framework.tsvfile
        source_path = self.framework.mixedcompositepath
        work_dir = self.framework.testdatapath

        inputs = {
            'inputpath': source_path,
            'outputfile': target_file,
            'workspace': work_dir,
            'format': 'txt',
        }

        # Aggregate the text files.
        result = text_file_aggregator(inputs)

        aggregated_path = result['outputfile']
        self.assertTrue(os.path.isfile(aggregated_path),
                        aggregated_path + ' does not exist')

        self.assertEqual(result['aggregaterowcount'], 19,
                         'incorrect number of rows')

        found_header = read_header(aggregated_path)

        # The model header is listed in ASCII order: names starting with an
        # upper-case letter come first, then the lower-case names.
        expected_header = [
            'BCID', 'CollectionCode', 'DatasetName', 'Id', 'InstitutionCode',
            'associatedMedia', 'associatedReferences', 'associatedSequences',
            'associatedTaxa', 'basisOfIdentification', 'catalogNumber',
            'class', 'coordinateUncertaintyInMeters', 'country', 'county',
            'day', 'dayCollected', 'dayIdentified', 'decimalLatitude',
            'decimalLongitude', 'establishmentMeans', 'eventRemarks',
            'extractionID', 'family', 'fieldNotes', 'fieldNumber',
            'fundingSource', 'geneticTissueType', 'genus', 'geodeticDatum',
            'georeferenceProtocol', 'habitat', 'identifiedBy', 'island',
            'islandGroup', 'length', 'lifeStage', 'locality',
            'materialSampleID', 'maximumDepthInMeters',
            'maximumDistanceAboveSurfaceInMeters', 'microHabitat',
            'minimumDepthInMeters', 'minimumDistanceAboveSurfaceInMeters',
            'month', 'monthCollected', 'monthIdentified', 'occurrenceID',
            'occurrenceRemarks', 'order', 'permitInformation', 'phylum',
            'plateID', 'preservative', 'previousIdentifications',
            'previousTissueID', 'principalInvestigator', 'recordedBy',
            'reproductiveCondition', 'sampleOwnerInstitutionCode',
            'samplingProtocol', 'scientificName', 'scientificNameAuthorship',
            'sex', 'species', 'stateProvince', 'subSpecies', 'substratum',
            'taxonRemarks', 'tissueStorageID', 'vernacularName', 'weight',
            'wellID', 'wormsID', 'year', 'yearCollected', 'yearIdentified',
        ]

        self.assertEqual(len(found_header), 77,
                         'incorrect number of fields in header')
        self.assertEqual(found_header, expected_header,
                         'header not equal to the model header')