예제 #1
0
 def output(self):
     """Return the state_file result recording a submission of self.input_file.

     The file name combines the input file's base name (final extension
     removed) with self.timestamp in ISO format, where every ':' has been
     replaced by '-'. The call passes on_hdfs=True.
     """
     # ISO timestamp with colons swapped for dashes before use in the name.
     stamp = self.timestamp.isoformat().replace(':', '-')
     # Base name of the input file without its last extension.
     stem, _ext = os.path.splitext(os.path.basename(self.input_file))
     submitted_name = '%s-submitted-%s.txt' % (stem, stamp)
     return state_file(self.timestamp, 'warcs2cdx', submitted_name, on_hdfs=True)
예제 #2
0
 def output(self):
     """Return state_file(self.date, 'hdfs', 'block-scanner-reports.json')."""
     report_name = 'block-scanner-reports.json'
     return state_file(self.date, 'hdfs', report_name)
예제 #3
0
 def _state_file(self, state_date, ext):
     """Return the state_file result for '<self.name>.<ext>' on state_date.

     self.tag is passed as the second argument and self.on_hdfs is
     forwarded as the on_hdfs keyword.
     """
     file_name = '%s.%s' % (self.name, ext)
     return state_file(state_date, self.tag, file_name, on_hdfs=self.on_hdfs)
예제 #4
0
 def output(self):
     """Return state_file(self.date, 'hdfs', 'duplicate-files-list.tsv')."""
     listing_name = 'duplicate-files-list.tsv'
     return state_file(self.date, 'hdfs', listing_name)
예제 #5
0
 def output(self):
     """Return state_file(self.date, 'warc', 'warc-filesets.txt')."""
     filesets_name = 'warc-filesets.txt'
     return state_file(self.date, 'warc', filesets_name)
예제 #6
0
 def output(self):
     """Return the state_file result for 'all-files-list.csv'.

     None is passed as the first argument, 'access-hdfs' as the second,
     and on_hdfs is False.
     """
     listing_name = 'all-files-list.csv'
     return state_file(None, 'access-hdfs', listing_name, on_hdfs=False)
예제 #7
0
 def output(self):
     """Return a dict of two state_file results keyed 'owb' and 'pywb'.

     Both use self.date and 'access-data'; they differ only in file name
     ('access-whitelist-beta.txt' and 'access-whitelist-beta.aclj').
     """
     # Built in the same order as the dict entries: 'owb' first, then 'pywb'.
     owb_entry = state_file(self.date, 'access-data', 'access-whitelist-beta.txt')
     pywb_entry = state_file(self.date, 'access-data', 'access-whitelist-beta.aclj')
     return {'owb': owb_entry, 'pywb': pywb_entry}
예제 #8
0
 def output(self):
     """Return state_file(self.date, 'w3act-csv', 'all.json')."""
     json_name = 'all.json'
     return state_file(self.date, 'w3act-csv', json_name)
예제 #9
0
파일: listings.py 프로젝트: ukwa/ukwa-tasks
 def output(self):
     """Return the state_file result for the duplicate-files list.

     The file name embeds self.collection:
     'warc-<collection>-duplicate-files-list.tsv'.
     """
     listing_name = 'warc-%s-duplicate-files-list.tsv' % self.collection
     return state_file(self.date, 'hdfs', listing_name)
예제 #10
0
파일: listings.py 프로젝트: ukwa/ukwa-tasks
 def output(self):
     """Return the state_file result for 'ukwa-<subset>-files-list.csv'.

     The subset portion of the name comes from self.subset.
     """
     listing_name = 'ukwa-%s-files-list.csv' % self.subset
     return state_file(self.date, 'hdfs', listing_name)
예제 #11
0
파일: listings.py 프로젝트: ukwa/ukwa-tasks
 def output(self):
     """Return state_file(self.date, 'hdfs', 'warc-ukwa-files-list.csv')."""
     listing_name = 'warc-ukwa-files-list.csv'
     return state_file(self.date, 'hdfs', listing_name)
예제 #12
0
파일: listings.py 프로젝트: ukwa/ukwa-tasks
 def output(self):
     """Return the state_file result for 'all-files-list.csv.gz'.

     Uses self.date and 'hdfs', and passes on_hdfs=True.
     """
     archive_name = 'all-files-list.csv.gz'
     return state_file(self.date, 'hdfs', archive_name, on_hdfs=True)
예제 #13
0
파일: search.py 프로젝트: ukwa/ukwa-manage
 def output(self):
     """Return state_file(self.date, 'access-data', 'updated-collections-solr.json')."""
     json_name = 'updated-collections-solr.json'
     return state_file(self.date, 'access-data', json_name)
예제 #14
0
파일: search.py 프로젝트: ukwa/ukwa-manage
 def output(self):
     """Return state_file(self.date, 'access-data', 'indexer-annotations.json')."""
     json_name = 'indexer-annotations.json'
     return state_file(self.date, 'access-data', json_name)
예제 #15
0
파일: search.py 프로젝트: ukwa/ukwa-manage
 def output(self):
     """Log a warning, then return the state_file result for the metadata XML.

     The file name is 'title-level-metadata-w3act.xml', with self.date and
     'access-data' as the leading arguments.
     """
     # NOTE(review): this warning fires on every call and looks like a
     # leftover debugging aid - confirm whether it is still wanted.
     logger.warning('in output')
     xml_name = 'title-level-metadata-w3act.xml'
     return state_file(self.date, 'access-data', xml_name)
예제 #16
0
 def output(self):
     """Return state_file(self.date, 'w3act-csv', 'db-csv.zip')."""
     zip_name = 'db-csv.zip'
     return state_file(self.date, 'w3act-csv', zip_name)
예제 #17
0
 def output(self):
     """Return the state_file result for 'db-csv.zip' with on_hdfs=True.

     self.date and 'w3act-csv' are the leading arguments.
     """
     zip_name = 'db-csv.zip'
     return state_file(self.date, 'w3act-csv', zip_name, on_hdfs=True)
예제 #18
0
파일: listings.py 프로젝트: ukwa/ukwa-tasks
 def state_file(self, state_date, ext='csv'):
     """Return the module-level state_file result for 'all-files-list.<ext>'.

     Args:
         state_date: Passed through as the first argument.
         ext: File extension appended to 'all-files-list.'; defaults to 'csv'.

     The call uses 'hdfs' as the second argument and on_hdfs=False.
     """
     listing_name = 'all-files-list.%s' % ext
     # Inside the method body the bare name refers to the module-level
     # helper, not to this method.
     return state_file(state_date, 'hdfs', listing_name, on_hdfs=False)
예제 #19
0
 def output(self):
     """Return state_file(self.date, 'access-data', 'access-whitelist-updated.txt')."""
     whitelist_name = 'access-whitelist-updated.txt'
     return state_file(self.date, 'access-data', whitelist_name)
예제 #20
0
 def output(self):
     """Return the state_file result for the crawl feed JSON.

     The file name is 'crawl-feed-<feed>.<frequency>.json', built from
     self.feed and self.frequency.
     """
     feed_name = 'crawl-feed-%s.%s.json' % (self.feed, self.frequency)
     return state_file(self.date, 'w3act-csv', feed_name)
예제 #21
0
 def output(self):
     """Return the state_file result for '<file_count>-warc-files-for-date.txt'.

     self.target_date is the first argument and 'warcs' the second; the
     count prefix comes from self.file_count.
     """
     listing_name = '%s-warc-files-for-date.txt' % self.file_count
     return state_file(self.target_date, 'warcs', listing_name)
예제 #22
0
 def output(self):
     """Return state_file(self.date, 'w3act-csv', 'crawl-feed-but-all-oa.json')."""
     feed_name = 'crawl-feed-but-all-oa.json'
     return state_file(self.date, 'w3act-csv', feed_name)
예제 #23
0
 def dated_state_file(self):
     """Return the state_file result for 'all-files-list.csv.gz' on self.date.

     Uses 'access-hdfs' as the second argument and on_hdfs=False.
     """
     archive_name = 'all-files-list.csv.gz'
     return state_file(self.date, 'access-hdfs', archive_name, on_hdfs=False)
예제 #24
0
 def output(self):
     """Return state_file(self.date, 'w3act-collections', 'collections.json')."""
     json_name = 'collections.json'
     return state_file(self.date, 'w3act-collections', json_name)
예제 #25
0
 def output(self):
     """Return state_file(self.date, 'hdfs', 'empty-files-list.csv')."""
     listing_name = 'empty-files-list.csv'
     return state_file(self.date, 'hdfs', listing_name)
예제 #26
0
 def output(self):
     """Return state_file(self.date, 'w3act-subjects', 'subject-list.json')."""
     json_name = 'subject-list.json'
     return state_file(self.date, 'w3act-subjects', json_name)
예제 #27
0
 def output(self):
     """Return state_file(self.date, 'hdfs', 'crawl-file-lists.txt')."""
     listing_name = 'crawl-file-lists.txt'
     return state_file(self.date, 'hdfs', listing_name)
예제 #28
0
 def output(self):
     """Return state_file(self.date, 'w3act-target-list', 'target-list.json')."""
     json_name = 'target-list.json'
     return state_file(self.date, 'w3act-target-list', json_name)
예제 #29
0
 def output(self):
     """Return the state_file result for 'target-list-<frequency>.json'.

     The frequency portion of the name comes from self.frequency.
     """
     json_name = 'target-list-%s.json' % self.frequency
     return state_file(self.date, 'w3act-target-list', json_name)
예제 #30
0
 def output(self):
     """Return state_file(self.date, 'access-data', 'title-level-metadata-w3act.xml')."""
     xml_name = 'title-level-metadata-w3act.xml'
     return state_file(self.date, 'access-data', xml_name)