def test_source_headers_correct(self):
    # Read the header of each of the four tsv/csv fixtures and compare both
    # its length and its exact content against a model header.
    print('testing source_headers_correct')
    fixtures = self.framework
    tsv_first = fixtures.tsvtest1
    tsv_second = fixtures.tsvtest2
    csv_first = fixtures.csvtest1
    csv_second = fixtures.csvtest2

    # Model for the "1" fixtures: four named columns plus an empty name.
    blank_tailed = [
        'materialSampleID',
        'principalInvestigator',
        'locality',
        'phylum',
        '',
    ]
    # Model for the "2" fixtures: the same four columns plus coordinates.
    with_coordinates = [
        'materialSampleID',
        'principalInvestigator',
        'locality',
        'phylum',
        'decimalLatitude',
        'decimalLongitude',
    ]

    # (file, model header, expected number of fields), in checking order.
    checks = (
        (tsv_first, blank_tailed, 5),
        (tsv_second, with_coordinates, 6),
        (csv_second, with_coordinates, 6),
        (csv_first, blank_tailed, 5),
    )
    for path, model, width in checks:
        found = read_header(path)
        self.assertEqual(len(found), width,
                         'incorrect number of fields in header')
        self.assertEqual(found, model,
                         'header not equal to the model header')
def test_vocab_appender(self):
    # Append two geography keys to the test vocabulary file, then append the
    # same two keys again. The first call must report both values as added,
    # the second must report None, and the first header field of the file
    # must be the composed geography key.
    print('testing vocab_appender')
    testvocabfile = self.framework.testvocabfile
    geogkey = compose_key_from_list(geogkeytermlist)
    g1 = ('Oceania|United States|US|Hawaii|Honolulu|Honolulu|'
          'North Pacific Ocean|Hawaiian Islands|Oahu')
    g2 = '|United States||WA|Chelan Co.||||'
    newvalues = [g1, g2]
    inputs = {
        'vocabfile': testvocabfile,
        'key': geogkey,
        'checkvaluelist': newvalues,
    }

    # Add new vocab to new vocab file
    response = vocab_appender(inputs)
    writtenlist = response['addedvalues']
    self.assertEqual(writtenlist, newvalues,
                     'values not written to new testvocabfile')

    # Attempt to add same vocabs to the same vocabs file
    response = vocab_appender(inputs)
    writtenlist = response['addedvalues']
    self.assertIsNone(writtenlist,
                      'duplicate value written to testvocabfile')

    header = read_header(testvocabfile)
    # assertEqual rather than the deprecated alias assertEquals, which newer
    # unittest versions no longer provide.
    self.assertEqual(header[0], geogkey,
                     'key field not correct in testvocabfile')
def test_term_exists(self):
    # Check that specific column names are present in the headers of the
    # first two test files.
    print('testing term_exists')
    template = 'test file %s does not contain "%s" field'

    path = self.framework.testfile1
    fields = read_header(path)
    # 'fieldNumber ' is looked up with its trailing blank.
    for wanted in ('year', 'fieldNumber '):
        self.assertTrue(wanted in fields, template % (path, wanted))

    path = self.framework.testfile2
    fields = read_header(path)
    self.assertTrue('month' in fields, template % (path, 'month'))
def test_text_file_filter(self):
    # Filter the test input file on year == '1990' and check the report
    # that text_file_filter produces.
    print('testing text_file_filter')
    source = self.framework.testinputfile
    reportname = self.framework.testreportfile
    datadir = self.framework.testdatapath
    # Nominal report location; the checks below rely on the path reported
    # in the response instead.
    nominal = '%s/%s' % (datadir.rstrip('/'), reportname)
    field = 'year'
    wanted = '1990'
    params = {
        'inputfile': source,
        'termname': field,
        'matchingvalue': wanted,
        'workspace': datadir,
        'outputfile': reportname,
    }

    # Create the report
    result = text_file_filter(params)
    self.assertTrue(result['success'],
                    'text file filter failed: %s' % result['message'])

    report = result['outputfile']
    self.assertTrue(os.path.isfile(report),
                    'Output file %s not created' % report)

    columns = read_header(report)
    fmt = csv_file_dialect(report)
    enc = csv_file_encoding(report)

    # Count the report rows whose term carries the requested value.
    reader = read_csv_row(report, dialect=fmt, encoding=enc, header=True,
                          fieldnames=columns)
    hits = sum(1 for record in reader if record[field] == wanted)
    target = 5
    self.assertEqual(
        hits, target,
        'Number of matches in output (%s) not as expected (%s)' % (
            hits, target))

    total = count_rows(report)
    target = 7
    note = 'Number of matches of %s in %s ' % (wanted, report)
    note += 'was %s, not as expected (%s) ' % (total, target)
    self.assertEqual(total, target, note)
def test_writevocabheader(self):
    # Write a vocabulary header to the test file and read it back.
    print('testing writevocabheader')
    target = self.framework.writevocabheadertestfile
    columns = (
        'country|stateprovince|county',
        'standard',
        'vetted',
        'error',
        'misplaced',
        'unresolved',
        'source',
        'comment',
    )
    # Separate lists for what is written and what is expected back.
    towrite = list(columns)
    wanted = list(columns)

    written = writevocabheader(target, towrite, vocab_dialect())
    self.assertTrue(written, 'vocab header not written')

    found = read_header(target)
    note = 'header:\n%s\nfrom file: %s\nnot as expected:\n%s' % (
        found, target, wanted)
    self.assertEqual(found, wanted, note)
def test_read_vocab_header(self):
    # Read the header of the month vocabulary file with the vocabulary
    # dialect and utf-8 encoding; it must have three fields, namely 'month'
    # followed by the standard vocabulary fields.
    print('testing read_vocab_header')
    fmt = vocab_dialect()
    enc = 'utf-8'
    path = self.framework.monthvocabfile
    columns = read_header(path, fmt, enc)

    count = len(columns)
    wantedcount = 3
    self.assertEqual(
        count, wantedcount,
        'Found %s fields in header. Expected %s' % (count, wantedcount))

    wanted = ['month'] + vocabfieldlist
    note = 'File: %s\nheader: %s\n' % (path, columns)
    note += 'not as expected: %s' % wanted
    self.assertEqual(columns, wanted, note)
def test_vocab_headers_correct(self):
    # For every controlled term, make sure '<vocabpath><term>.txt' exists
    # (writing a header when it does not) and check that its header is the
    # lower-cased term followed by the standard vocabulary fields.
    print('testing vocab_headers_correct')
    basepath = self.framework.vocabpath
    fmt = vocab_dialect()
    enc = 'utf-8'

    for term in controlledtermlist:
        path = basepath + term + '.txt'
        if not os.path.isfile(path):
            # NOTE(review): only vocabfieldlist is passed here, while the
            # expectation below prepends the term; confirm that a freshly
            # written file can satisfy the assertion.
            writevocabheader(path, vocabfieldlist, fmt, enc)

        found = read_header(path, fmt, enc)
        wanted = [term.lower()] + vocabfieldlist
        note = 'File: %s\nheader: %s\n' % (path, found)
        note += 'not as expected: %s' % wanted
        self.assertEqual(found, wanted, note)
def test_term_standardizer_report(self):
    # Standardize the 'month' term of the correction input file against the
    # month vocabulary, then check the report header and the first row:
    # the original value 'vi' must be kept in 'month_orig' and 'month' must
    # hold '6'.
    print('testing term_standardizer_report')
    testcorrectioninputfile = self.framework.testcorrectioninputfile
    testcorrectionreportfile = self.framework.testcorrectionreportfile
    testmonthvocabfile = self.framework.testmonthvocabfile
    key = 'month'

    result = term_standardizer_report(
        testcorrectioninputfile, testcorrectionreportfile,
        testmonthvocabfile, key)
    # Each fragment ends with a blank so the file names do not run together.
    s = 'term_standardizer_report() result not True '
    s += 'with inputfile: %s ' % testcorrectioninputfile
    s += 'outputfile: %s ' % testcorrectionreportfile
    s += 'and vocabfile: %s' % testmonthvocabfile
    self.assertTrue(result, s)

    outputheader = read_header(testcorrectionreportfile)
    expected = ['ID', 'month', 'country', 'month_orig']
    s = 'outputheader: %s not as expected: %s' % (outputheader, expected)
    self.assertEqual(outputheader, expected, s)

    dialect = csv_file_dialect(testcorrectionreportfile)
    encoding = csv_file_encoding(testcorrectionreportfile)
    rows = read_rows(testcorrectionreportfile, 1, dialect=dialect,
                     encoding=encoding, header=True,
                     fieldnames=outputheader)
    firstrow = rows[0]

    # (field, expected value) pairs checked on the first report row.
    for field, expected in (('month_orig', 'vi'), ('month', '6')):
        value = firstrow[field]
        s = 'Field %s value %s not as expected (%s)' % (
            field, value, expected)
        self.assertEqual(value, expected, s)
def test_text_file_field_stripper(self):
    # Reduce the test input file to the terms 'country|stateProvince' and
    # check the report that text_file_field_stripper produces: it must
    # exist, have 10 rows, and carry the header ['country', 'stateprovince'].
    print('testing text_file_field_stripper')
    testinputfile = self.framework.testinputfile
    testreportfile = self.framework.testreportfile
    workspace = self.framework.testdatapath
    inputs = {
        'inputfile': testinputfile,
        'termlist': 'country|stateProvince',
        'workspace': workspace,
        'outputfile': testreportfile,
        'separator': '|',
    }

    # Create the report
    response = text_file_field_stripper(inputs)
    # Name the function under test in the failure message.
    s = 'text file field stripper failed: %s' % response['message']
    self.assertTrue(response['success'], s)

    outputfile = response['outputfile']
    s = 'Output file %s not created' % outputfile
    self.assertTrue(os.path.isfile(outputfile), s)

    header = read_header(outputfile)

    rows = count_rows(outputfile)
    expected = 10
    s = 'Number of rows in %s ' % outputfile
    s += 'was %s, not as expected (%s) ' % (rows, expected)
    self.assertEqual(rows, expected, s)

    expected = ['country', 'stateprovince']
    s = 'Header: %s, not as expected: %s' % (header, expected)
    self.assertEqual(header, expected, s)
def test_aggregate_tsvs(self):
    # Aggregate the files under the tsv composite path with the 'tsv' input
    # dialect, then check the output file, the reported row count and the
    # header of the result.
    print('testing aggregate_tsvs')
    target = self.framework.tsvfile
    sourcedir = self.framework.tsvcompositepath
    datadir = self.framework.testdatapath
    params = {
        'inputpath': sourcedir,
        'outputfile': target,
        'inputdialect': 'tsv',
        'workspace': datadir,
    }

    # Aggregate text file
    result = text_file_aggregator(params)
    produced = result['outputfile']
    self.assertTrue(os.path.isfile(produced), produced + ' does not exist')
    self.assertEqual(result['aggregaterowcount'], 6,
                     'incorrect number of rows')

    found = read_header(produced)
    model = [
        'decimalLatitude',
        'decimalLongitude',
        'locality',
        'materialSampleID',
        'phylum',
        'principalInvestigator',
    ]
    self.assertEqual(len(found), 6, 'incorrect number of fields in header')
    self.assertEqual(found, model, 'header not equal to the model header')
def test_term_completeness_reporter(self):
    # Run term_completeness_reporter on the test input file and check that
    # the report exists, has 24 rows and the header ['field', 'count'].
    print('testing term_completeness_reporter')
    source = self.framework.testinputfile
    reportname = self.framework.testreportfile
    datadir = self.framework.testdatapath
    # Nominal report location; the checks below rely on the path reported
    # in the response instead.
    nominal = '%s/%s' % (datadir.rstrip('/'), reportname)
    params = {
        'inputfile': source,
        'workspace': datadir,
        'outputfile': reportname,
    }

    # Create the report
    result = term_completeness_reporter(params)
    self.assertTrue(
        result['success'],
        'Term completeness counter failed: %s' % result['message'])

    report = result['outputfile']
    self.assertTrue(os.path.isfile(report),
                    'Output file %s not created' % report)

    columns = read_header(report)

    total = count_rows(report)
    wanted = 24
    note = 'Number of rows in %s ' % report
    note += 'was %s, not as expected (%s) ' % (total, wanted)
    self.assertEqual(total, wanted, note)

    wanted = ['field', 'count']
    note = 'Header: %s, not as expected: %s' % (columns, wanted)
    self.assertEqual(columns, wanted, note)
def test_source_headers_correct(self):
    # Convert the core of the test archive with dwca_core_to_tsv and check
    # that the header of the resulting file has the 85 expected fields in
    # the expected order.
    print('testing source_headers_correct')
    archive = self.framework.dwca
    datadir = self.framework.testdatapath
    target = self.framework.outputfile
    kind = self.framework.archivetype
    params = {
        'inputfile': archive,
        'outputfile': target,
        'workspace': datadir,
        'archivetype': kind,
    }

    result = dwca_core_to_tsv(params)
    produced = result['outputfile']
    found = read_header(produced, tsv_dialect())

    model = [
        'type',
        'modified',
        'language',
        'accessRights',
        'references',
        'institutionCode',
        'collectionCode',
        'basisOfRecord',
        'informationWithheld',
        'dynamicProperties',
        'occurrenceID',
        'catalogNumber',
        'recordNumber',
        'recordedBy',
        'individualCount',
        'sex',
        'lifeStage',
        'establishmentMeans',
        'preparations',
        'associatedMedia',
        'associatedSequences',
        'associatedTaxa',
        'otherCatalogNumbers',
        'occurrenceRemarks',
        'associatedOccurrences',
        'previousIdentifications',
        'fieldNumber',
        'eventDate',
        'eventTime',
        'endDayOfYear',
        'year',
        'month',
        'day',
        'verbatimEventDate',
        'habitat',
        'samplingProtocol',
        'eventRemarks',
        'higherGeography',
        'continent',
        'waterBody',
        'islandGroup',
        'island',
        'country',
        'stateProvince',
        'county',
        'locality',
        'verbatimLocality',
        'minimumElevationInMeters',
        'maximumElevationInMeters',
        'minimumDepthInMeters',
        'maximumDepthInMeters',
        'locationAccordingTo',
        'locationRemarks',
        'decimalLatitude',
        'decimalLongitude',
        'geodeticDatum',
        'coordinateUncertaintyInMeters',
        'verbatimCoordinates',
        'verbatimCoordinateSystem',
        'georeferencedBy',
        'georeferencedDate',
        'georeferenceProtocol',
        'georeferenceSources',
        'georeferenceVerificationStatus',
        'identificationQualifier',
        'typeStatus',
        'identifiedBy',
        'dateIdentified',
        'identificationReferences',
        'identificationVerificationStatus',
        'identificationRemarks',
        'scientificName',
        'higherClassification',
        'kingdom',
        'phylum',
        'class',
        'order',
        'family',
        'genus',
        'specificEpithet',
        'infraspecificEpithet',
        'taxonRank',
        'nomenclaturalCode',
        'individualID',
        'rights',
    ]

    self.assertEqual(len(found), 85, 'incorrect number of fields in header')
    note = 'Header:\n%s\nnot equal to the model header:\n%s' % (found, model)
    self.assertEqual(found, model, note)
def test_term_setter_report(self):
    # Run term_setter_report three times against the same input and report
    # files: adding one field, adding a list of fields, and replacing the
    # value of an existing field. After each run the report header and the
    # first report row are checked.
    print('testing term_setter_report')
    infile = self.framework.testsetterinputfile
    outfile = self.framework.testsetterreportfile

    # (key, constantvalues, expected header, expected first-row values)
    scenarios = (
        # Field addition
        ('institutionCode', 'CAS',
         ['ID', 'month', 'country', 'institutionCode'],
         (('institutionCode', 'CAS'),)),
        # Field list addition
        ('institutionCode|license', 'CAS|CC0',
         ['ID', 'month', 'country', 'institutionCode', 'license'],
         (('institutionCode', 'CAS'), ('license', 'CC0'))),
        # Field replacement
        ('country', 'Argentina',
         ['ID', 'month', 'country'],
         (('country', 'Argentina'),)),
    )

    for key, constants, wantedheader, wantedvalues in scenarios:
        ok = term_setter_report(infile, outfile, key,
                                constantvalues=constants)
        note = 'term_setter_report() result not True '
        note += 'with inputfile: %s ' % infile
        note += 'and outputfile: %s' % outfile
        self.assertTrue(ok, note)

        columns = read_header(outfile)
        note = 'outputheader: %s not as expected: %s' % (
            columns, wantedheader)
        self.assertEqual(columns, wantedheader, note)

        fmt = csv_file_dialect(outfile)
        enc = csv_file_encoding(outfile)
        first = read_rows(outfile, 1, dialect=fmt, encoding=enc,
                          header=True, fieldnames=columns)[0]

        for name, wanted in wantedvalues:
            got = first[name]
            note = 'Field %s value %s not as expected (%s)' % (
                name, got, wanted)
            self.assertEqual(got, wanted, note)
def test_darwinize_header(self):
    # Darwinize the header of each of the three test files in turn. After
    # each run, check the header of the output file and the list of its
    # terms that terms_not_in_dwc reports (case-sensitively) as not being
    # Darwin Core.
    print('testing darwinize_header')
    testfile1 = self.framework.testfile1
    testfile2 = self.framework.testfile2
    testfile3 = self.framework.testfile3
    testdatapath = self.framework.testdatapath
    dwccloudfile = self.framework.dwccloudfile
    outputfile = self.framework.outputfile

    inputs = {
        'inputfile': testfile1,
        'dwccloudfile': dwccloudfile,
        'outputfile': outputfile,
        'workspace': testdatapath,
    }
    # Every run writes to the same output location.
    outfilelocation = '%s/%s' % (testdatapath, outputfile)
    casesensitive = True
    # Show complete diffs when one of the long header lists mismatches.
    self.maxDiff = None

    header1 = [
        'catalogNumber', 'recordedBy', 'fieldNumber', 'year', 'month',
        'day', 'decimalLatitude', 'decimalLongitude', 'geodeticDatum',
        'country', 'stateProvince', 'county', 'locality', 'family',
        'scientificName', 'scientificNameAuthorship',
        'reproductiveCondition', 'institutionCode', 'collectionCode',
        'datasetName', 'Id',
    ]
    notdwc1 = ['Id']

    header2 = [
        'materialSampleID', 'principalInvestigator', 'locality', 'phylum',
        'decimalLatitude', 'decimalLongitude',
        'coordinateUncertaintyInMeters', 'georeferenceProtocol', 'year',
        'month', 'day', 'genus', 'specificEpithet', 'permitInformation',
        'basisOfIdentification', 'taxonID', 'country', 'stateProvince',
        'island', 'islandGroup', 'sampleOwnerInstitutionCode',
        'fundingSource', 'occurrenceID', 'associatedMedia',
        'associatedReferences', 'preservative', 'previousIdentifications',
        'lifeStage', 'weight', 'length', 'sex', 'establishmentMeans',
        'associatedSequences', 'occurrenceRemarks', 'habitat',
        'microHabitat', 'substratum', 'samplingProtocol',
        'minimumDepthInMeters', 'maximumDepthInMeters',
        'minimumDistanceAboveSurfaceInMeters',
        'maximumDistanceAboveSurfaceInMeters', 'associatedTaxa',
        'fieldNotes', 'eventRemarks', 'recordedBy', 'identifiedBy',
        'yearIdentified', 'monthIdentified', 'dayIdentified', 'class',
        'order', 'family', 'infraspecificEpithet', 'vernacularName',
        'taxonRemarks', 'geneticTissueType', 'plateID', 'wellID',
        'extractionID', 'otherCatalogNumbers', 'tissueStorageID', 'BCID',
        'UNNAMED_COLUMN_1',
    ]
    notdwc2 = [
        'BCID', 'UNNAMED_COLUMN_1', 'basisOfIdentification',
        'dayIdentified', 'extractionID', 'fundingSource',
        'geneticTissueType', 'length', 'microHabitat', 'monthIdentified',
        'permitInformation', 'plateID', 'preservative',
        'principalInvestigator', 'sampleOwnerInstitutionCode',
        'substratum', 'tissueStorageID', 'weight', 'wellID',
        'yearIdentified',
    ]

    header3 = [
        'id', 'institutionCode', 'collectionCode', 'basisOfRecord',
        'occurrenceID', 'catalogNumber', 'otherCatalogNumbers', 'kingdom',
        'phylum', 'class', 'order', 'family', 'scientificName',
        'scientificNameAuthorship', 'genus', 'specificEpithet',
        'taxonRank', 'infraspecificEpithet', 'identifiedBy',
        'dateIdentified', 'identificationReferences',
        'identificationRemarks', 'taxonRemarks',
        'identificationQualifier', 'typeStatus', 'recordedBy',
        'recordedByID', 'associatedCollectors', 'recordNumber',
        'eventDate', 'year', 'month', 'day', 'startDayOfYear',
        'endDayOfYear', 'verbatimEventDate', 'occurrenceRemarks',
        'habitat', 'substrate', 'verbatimAttributes', 'fieldNumber',
        'informationWithheld', 'dataGeneralizations', 'dynamicProperties',
        'associatedTaxa', 'reproductiveCondition', 'establishmentMeans',
        'cultivationStatus', 'lifeStage', 'sex', 'individualCount',
        'samplingProtocol', 'samplingEffort', 'preparations', 'country',
        'stateProvince', 'county', 'municipality', 'locality',
        'locationRemarks', 'localitySecurity', 'localitySecurityReason',
        'decimalLatitude', 'decimalLongitude', 'geodeticDatum',
        'coordinateUncertaintyInMeters', 'verbatimCoordinates',
        'georeferencedBy', 'georeferenceProtocol', 'georeferenceSources',
        'georeferenceVerificationStatus', 'georeferenceRemarks',
        'minimumElevationInMeters', 'maximumElevationInMeters',
        'minimumDepthInMeters', 'maximumDepthInMeters', 'verbatimDepth',
        'verbatimElevation', 'disposition', 'language', 'recordEnteredBy',
        'modified', 'sourcePrimaryKey', 'collId', 'recordId', 'references',
    ]
    notdwc3 = [
        'associatedCollectors', 'collId', 'cultivationStatus', 'id',
        'localitySecurity', 'localitySecurityReason', 'recordEnteredBy',
        'recordId', 'recordedByID', 'sourcePrimaryKey', 'substrate',
        'verbatimAttributes',
    ]

    # (input file, expected output header, expected non-Darwin Core terms)
    cases = (
        (testfile1, header1, notdwc1),
        (testfile2, header2, notdwc2),
        (testfile3, header3, notdwc3),
    )
    for inputfile, expectedheader, expectednotdwc in cases:
        inputs['inputfile'] = inputfile

        # Darwinize the header
        darwinize_header(inputs)
        header = read_header(outfilelocation)
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (
            inputfile, header, expectedheader)
        self.assertEqual(header, expectedheader, s)

        # What is not Darwin Core? The message names the file of this case.
        notdwc = terms_not_in_dwc(header, casesensitive)
        s = 'From input: %s\nFound:\n%s\nExpected:\n%s' % (
            inputfile, notdwc, expectednotdwc)
        self.assertEqual(notdwc, expectednotdwc, s)
def test_headers(self):
    # Check the raw headers of the three test files exactly as read_header
    # returns them; several names expected for the first file carry a
    # trailing blank and the last name expected for the second is empty.
    print('testing headers')

    expected1 = [
        'catalogNumber ', 'recordedBy', 'fieldNumber ', 'year', 'month',
        'day', 'decimalLatitude ', 'decimalLongitude ', 'geodeticDatum ',
        'country', 'stateProvince', 'county', 'locality', 'family ',
        'scientificName ', 'scientificNameAuthorship ',
        'reproductiveCondition ', 'InstitutionCode ', 'CollectionCode ',
        'DatasetName ', 'Id',
    ]
    expected2 = [
        'materialSampleID', 'principalInvestigator', 'locality', 'phylum',
        'decimalLatitude', 'decimalLongitude',
        'coordinateUncertaintyInMeters', 'georeferenceProtocol',
        'yearCollected', 'monthCollected', 'dayCollected', 'genus',
        'species', 'permitInformation', 'basisOfIdentification', 'wormsID',
        'country', 'stateProvince', 'island', 'islandGroup',
        'sampleOwnerInstitutionCode', 'fundingSource', 'occurrenceID',
        'associatedMedia', 'associatedReferences', 'preservative',
        'previousIdentifications', 'lifeStage', 'weight', 'length', 'sex',
        'establishmentMeans', 'associatedSequences', 'occurrenceRemarks',
        'habitat', 'microHabitat', 'substratum', 'samplingProtocol',
        'minimumDepthInMeters', 'maximumDepthInMeters',
        'minimumDistanceAboveSurfaceInMeters',
        'maximumDistanceAboveSurfaceInMeters', 'associatedTaxa',
        'fieldNotes', 'eventRemarks', 'recordedBy', 'identifiedBy',
        'yearIdentified', 'monthIdentified', 'dayIdentified', 'class',
        'order', 'family', 'subSpecies', 'vernacularName', 'taxonRemarks',
        'geneticTissueType', 'plateID', 'wellID', 'extractionID',
        'previousTissueID', 'tissueStorageID', 'BCID', '',
    ]
    expected3 = [
        'id', 'institutionCode', 'collectionCode', 'basisOfRecord',
        'occurrenceID', 'catalogNumber', 'otherCatalogNumbers', 'kingdom',
        'phylum', 'class', 'order', 'family', 'scientificName',
        'scientificNameAuthorship', 'genus', 'specificEpithet',
        'taxonRank', 'infraspecificEpithet', 'identifiedBy',
        'dateIdentified', 'identificationReferences',
        'identificationRemarks', 'taxonRemarks',
        'identificationQualifier', 'typeStatus', 'recordedBy',
        'recordedByID', 'associatedCollectors', 'recordNumber',
        'eventDate', 'year', 'month', 'day', 'startDayOfYear',
        'endDayOfYear', 'verbatimEventDate', 'occurrenceRemarks',
        'habitat', 'substrate', 'verbatimAttributes', 'fieldNumber',
        'informationWithheld', 'dataGeneralizations', 'dynamicProperties',
        'associatedTaxa', 'reproductiveCondition', 'establishmentMeans',
        'cultivationStatus', 'lifeStage', 'sex', 'individualCount',
        'samplingProtocol', 'samplingEffort', 'preparations', 'country',
        'stateProvince', 'county', 'municipality', 'locality',
        'locationRemarks', 'localitySecurity', 'localitySecurityReason',
        'decimalLatitude', 'decimalLongitude', 'geodeticDatum',
        'coordinateUncertaintyInMeters', 'verbatimCoordinates',
        'georeferencedBy', 'georeferenceProtocol', 'georeferenceSources',
        'georeferenceVerificationStatus', 'georeferenceRemarks',
        'minimumElevationInMeters', 'maximumElevationInMeters',
        'minimumDepthInMeters', 'maximumDepthInMeters', 'verbatimDepth',
        'verbatimElevation', 'disposition', 'language', 'recordEnteredBy',
        'modified', 'sourcePrimaryKey', 'collId', 'recordId', 'references',
    ]

    # (test file, expected header), checked in this order.
    cases = (
        (self.framework.testfile1, expected1),
        (self.framework.testfile2, expected2),
        (self.framework.testfile3, expected3),
    )
    for testfile, expected in cases:
        header = read_header(testfile)
        s = 'test file %s header:\n%s does not match expected:\n%s' % (
            testfile, header, expected)
        # Pass the message along so a failure names the offending file.
        self.assertEqual(header, expected, s)
def test_aggregate_mix(self):
    # Aggregate the files under the mixed composite path with format 'txt',
    # then check the output file, the reported row count (19) and the 77
    # fields of the resulting header.
    print('testing aggregate_mix')
    target = self.framework.tsvfile
    sourcedir = self.framework.mixedcompositepath
    datadir = self.framework.testdatapath
    params = {
        'inputpath': sourcedir,
        'outputfile': target,
        'workspace': datadir,
        'format': 'txt',
    }

    # Aggregate text file
    result = text_file_aggregator(params)
    produced = result['outputfile']
    self.assertTrue(os.path.isfile(produced), produced + ' does not exist')
    self.assertEqual(result['aggregaterowcount'], 19,
                     'incorrect number of rows')

    found = read_header(produced)
    model = [
        'BCID',
        'CollectionCode',
        'DatasetName',
        'Id',
        'InstitutionCode',
        'associatedMedia',
        'associatedReferences',
        'associatedSequences',
        'associatedTaxa',
        'basisOfIdentification',
        'catalogNumber',
        'class',
        'coordinateUncertaintyInMeters',
        'country',
        'county',
        'day',
        'dayCollected',
        'dayIdentified',
        'decimalLatitude',
        'decimalLongitude',
        'establishmentMeans',
        'eventRemarks',
        'extractionID',
        'family',
        'fieldNotes',
        'fieldNumber',
        'fundingSource',
        'geneticTissueType',
        'genus',
        'geodeticDatum',
        'georeferenceProtocol',
        'habitat',
        'identifiedBy',
        'island',
        'islandGroup',
        'length',
        'lifeStage',
        'locality',
        'materialSampleID',
        'maximumDepthInMeters',
        'maximumDistanceAboveSurfaceInMeters',
        'microHabitat',
        'minimumDepthInMeters',
        'minimumDistanceAboveSurfaceInMeters',
        'month',
        'monthCollected',
        'monthIdentified',
        'occurrenceID',
        'occurrenceRemarks',
        'order',
        'permitInformation',
        'phylum',
        'plateID',
        'preservative',
        'previousIdentifications',
        'previousTissueID',
        'principalInvestigator',
        'recordedBy',
        'reproductiveCondition',
        'sampleOwnerInstitutionCode',
        'samplingProtocol',
        'scientificName',
        'scientificNameAuthorship',
        'sex',
        'species',
        'stateProvince',
        'subSpecies',
        'substratum',
        'taxonRemarks',
        'tissueStorageID',
        'vernacularName',
        'weight',
        'wellID',
        'wormsID',
        'year',
        'yearCollected',
        'yearIdentified',
    ]
    self.assertEqual(len(found), 77, 'incorrect number of fields in header')
    self.assertEqual(found, model, 'header not equal to the model header')