Exemplo n.º 1
0
class Timbl_test(Timbl_base):
    """Run the external tool on a test file, given an instance base and weights file.

    NOTE(review): ``weighting``, ``algorithm``, ``k``, ``metric`` and
    ``distance`` are not declared in this class; presumably they (and the
    executable) come from ``Timbl_base`` -- confirm.
    """

    in_ibase = InputSlot()  # passed to the tool as option ``i``
    in_wgt = InputSlot()  # passed (with the weighting appended) as option ``w``
    in_test = InputSlot()  # passed to the tool as option ``t``

    def out_timbl(self):
        """Output target: the test file name with '.test' replaced by '.timbl.out'."""
        return self.outputfrominput(
            inputformat='test',
            stripextension='.test',
            addextension='.timbl.out',
        )

    def out_log(self):
        """Log target: the test file name with '.test' replaced by '.timbl.test.log'."""
        return self.outputfrominput(
            inputformat='test',
            stripextension='.test',
            addextension='.timbl.test.log',
        )

    def run(self):
        """Invoke the tool; its standard output is redirected to the log target."""
        ibase_path = self.in_ibase().path
        test_path = self.in_test().path
        # The weights option has the form "<weights file>:<weighting>".
        weights_spec = ':'.join([self.in_wgt().path, self.weighting])
        result_path = self.out_timbl().path
        self.ex(
            i=ibase_path,
            t=test_path,
            w=weights_spec,
            o=result_path,
            a=self.algorithm,
            k=self.k,
            m=self.metric,
            d=self.distance,
            __stdout_to=self.out_log().path,
        )
Exemplo n.º 2
0
class UpdateEventTypesTask(Task):
    """Attach a predicted event type and per-label scores to each event.

    Reads three files from the prediction directory (event ids, predicted
    labels, and a tab-separated score table whose first line is the label
    order), looks every id up among the loaded events and stores the
    prediction on the matching event object, then writes all events back out.
    """

    in_events = InputSlot()
    in_predictiondir = InputSlot()

    text = BoolParameter()  # forwarded as ``txt`` to event import/export

    def out_updated_events(self):
        """Output target: '.events.integrated' replaced by '.types.events.integrated'."""
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.types.events.integrated')

    def run(self):
        """Load predictions and events, annotate the events, and dump them as JSON.

        Raises:
            IndexError: if there are fewer predictions than ids in the meta file.
            KeyError: if an id in the meta file matches no loaded event.
        """
        predictiondir = self.in_predictiondir().path

        # read prediction data
        # (the codec name was previously misspelled 'utf=8', which only worked
        # through codec-name normalisation; spell it canonically)
        with open(predictiondir + '/events_meta.txt', 'r', encoding='utf-8') as file_in:
            meta = file_in.read().strip().split('\n')

        with open(predictiondir + '/events_text.predictions.txt', 'r', encoding='utf-8') as file_in:
            predictions = file_in.read().strip().split('\n')

        with open(predictiondir + '/events_text.full_predictions.txt', 'r', encoding='utf-8') as file_in:
            lines = file_in.read().strip().split('\n')
        # first line names the labels, remaining lines hold one score row per event
        label_order = lines[0].split('\t')
        full_predictions = [line.split('\t') for line in lines[1:]]

        print('Meta',len(meta))
        print('Predictions',len(predictions))
        print('Full predictions',len(full_predictions))

        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed,txt=self.text)
            event_objs.append(eventobj)

        # index events by id so each prediction can be matched to its event
        id_event = {eo.mongo_id: eo for eo in event_objs}

        # for each prediction (indexing, not zip, so a length mismatch still fails loudly)
        for i,mid in enumerate(meta):
            eo = id_event[mid]
            eo.eventtype = predictions[i]
            eo.eventtype_scores = dict(zip(label_order,full_predictions[i]))

        # write output; the loop variable is deliberately not called ``event``
        # so it does not shadow the ``event`` module name
        out_updated_events = [eo.return_dict(txt=self.text) for eo in event_objs]
        with open(self.out_updated_events().path,'w',encoding='utf-8') as file_out:
            json.dump(out_updated_events,file_out)
Exemplo n.º 3
0
class ArchiveEventsTask(Task):
    """Split events into those dated exactly on ``archivedate`` and all others.

    Events whose ``datetime`` equals the archive date go to the archive file;
    the remaining events are written to the active-events file.
    """

    in_events = InputSlot()
    in_archivedir = InputSlot()

    archivedate = Parameter()  # sliced as YYYYMMDD in run()

    def out_archived(self):
        """Archive target: 'events_<archivedate>.json' inside the archive directory."""
        archive_suffix = '.archive/events_' + self.archivedate + '.json'
        return self.outputfrominput(
            inputformat='archivedir',
            stripextension='.archive',
            addextension=archive_suffix,
        )

    def out_active_events(self):
        """Active target: '.events.integrated' replaced by '.active.events.integrated'."""
        return self.outputfrominput(
            inputformat='events',
            stripextension='.events.integrated',
            addextension='.active.events.integrated',
        )

    def run(self):
        """Partition the input events by date and write both partitions as JSON."""
        stamp = self.archivedate
        year, month, day = int(stamp[:4]), int(stamp[4:6]), int(stamp[6:8])
        target_day = datetime.datetime(year, month, day)

        # read events
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())

        to_archive = []
        to_keep = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            # exact equality: only events stamped precisely at the archive date match
            bucket = to_archive if eventobj.datetime == target_day else to_keep
            bucket.append(eventobj)

        # write archive
        print('Writing archive')
        archived_dicts = [eo.return_dict(txt=False) for eo in to_archive]
        with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
            json.dump(archived_dicts, file_out)

        # write active events
        print('Writing active events')
        active_dicts = [eo.return_dict(txt=False) for eo in to_keep]
        with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(active_dicts, file_out)
Exemplo n.º 4
0
class LowercaseVoweleaterDirTask2(Task):
    """Schedule a ``Voweleater`` component for every matching file in a directory."""

    in_txtdir = InputSlot()
    extension = Parameter(default='txt')  # file extension (without dot) to glob for

    def out_txtdir(self):
        """Output directory target: '.txtdir' replaced by '.lcnv.txtdir'."""
        return self.outputfrominput(
            inputformat='txtdir',
            stripextension='.txtdir',
            addextension='.lcnv.txtdir',
        )

    def run(self):
        """Prepare the output directory and yield one component per input file."""
        # Set up the output directory (per the original note: created here and
        # torn down automatically on failure).
        target_dir = self.out_txtdir().path
        self.setup_output_dir(target_dir)

        # Gather input files.
        pattern = self.in_txtdir().path + '/*.' + self.extension
        inputfiles = glob.glob(pattern)

        # Dynamic dependencies: the list of sub-tasks is only known at run time.
        # Each input file gets a Voweleater component started from 'Lowercaser'.
        subtasks = []
        for inputfile in inputfiles:
            subtasks.append(
                Voweleater(inputfile=inputfile,
                           outputdir=target_dir,
                           startcomponent='Lowercaser'))
        yield subtasks
Exemplo n.º 5
0
class Timbl_train(Timbl_base):
    """Run the external tool on a training file, producing an instance base and weights.

    NOTE(review): ``algorithm``, ``k``, ``metric``, ``weighting`` and
    ``distance`` are not declared in this class; presumably they (and the
    executable) come from ``Timbl_base`` -- confirm.
    """

    in_train = InputSlot()  # passed to the tool as option ``f``

    def _derived(self, suffix):
        """Return the target named after the training file, '.train' swapped for *suffix*."""
        return self.outputfrominput(inputformat='train',
                                    stripextension='.train',
                                    addextension=suffix)

    def out_ibase(self):
        """Instance-base target ('.ibase')."""
        return self._derived('.ibase')

    def out_wgt(self):
        """Weights target ('.wgt')."""
        return self._derived('.wgt')

    def out_log(self):
        """Log target ('.timbl.train.log')."""
        return self._derived('.timbl.train.log')

    def run(self):
        """Invoke the tool; its standard output is redirected to the log target."""
        train_path = self.in_train().path
        ibase_path = self.out_ibase().path
        weights_path = self.out_wgt().path
        self.ex(
            f=train_path,
            I=ibase_path,
            W=weights_path,
            a=self.algorithm,
            k=self.k,
            m=self.metric,
            w=self.weighting,
            d=self.distance,
            __stdout_to=self.out_log().path,
        )
Exemplo n.º 6
0
class Ucto_folia2folia_dir(Task):
    """Schedule an ``Ucto`` component for every matching file in a directory."""

    extension = Parameter(default="folia.xml")  # extension (without leading dot) to glob for
    language = Parameter()  # forwarded unchanged to each Ucto component

    in_foliadir = InputSlot()  # input slot

    def out_tokfoliadir(self):
        """Output directory target: '.foliadir' replaced by '.tok.foliadir'."""
        return self.outputfrominput(
            inputformat='foliadir',
            stripextension='.foliadir',
            addextension='.tok.foliadir',
        )

    def run(self):
        """Prepare the output directory and yield one component per input file."""
        # Set up the output directory (per the original note: created here and
        # torn down automatically on failure).
        target_dir = self.out_tokfoliadir().path
        self.setup_output_dir(target_dir)

        # Gather input files.
        pattern = self.in_foliadir().path + '/*.' + self.extension
        inputfiles = glob.glob(pattern)

        # Dynamic dependencies: the list of sub-tasks is only known at run time.
        # Each input file gets its own Ucto component reading the 'folia' slot.
        subtasks = []
        for inputfile in inputfiles:
            subtasks.append(
                Ucto(inputfile=inputfile,
                     inputslot='folia',
                     outputdir=target_dir,
                     language=self.language))
        yield subtasks
Exemplo n.º 7
0
class TesseractOCR_document(Task):
    """OCR for a whole document.

    Input is a directory of TIFF image files (pages); output is a directory
    of hOCR files. One ``OCR_singlepage`` component is scheduled per page.
    """

    tiff_extension = Parameter(default='tif')  # extension (without dot) to glob for
    language = Parameter()  # forwarded unchanged to each page component

    in_tiffdir = InputSlot()  # input slot

    def out_hocrdir(self):
        """Output directory target: '.tiffdir' replaced by '.hocrdir'."""
        return self.outputfrominput(
            inputformat='tiffdir',
            stripextension='.tiffdir',
            addextension='.hocrdir',
        )

    def run(self):
        """Prepare the output directory and yield one component per page image."""
        # Set up the output directory (per the original note: created here and
        # torn down automatically on failure).
        target_dir = self.out_hocrdir().path
        self.setup_output_dir(target_dir)

        # Gather input files.
        pattern = self.in_tiffdir().path + '/*.' + self.tiff_extension
        inputfiles = glob.glob(pattern)

        # Dynamic dependencies: the list of sub-tasks is only known at run time.
        # Each input file gets its own OCR_singlepage component.
        subtasks = []
        for inputfile in inputfiles:
            subtasks.append(
                OCR_singlepage(inputfile=inputfile,
                               outputdir=target_dir,
                               language=self.language,
                               tiff_extension=self.tiff_extension))
        yield subtasks
Exemplo n.º 8
0
class ScaleTestTask(Task):
    """Schedule 'Voweleater' over the files 1.txt .. n.txt in batches of 1000."""

    in_txtdir = InputSlot()
    n = IntParameter()  # number of numbered input files expected in the directory

    def out_txtdir(self):
        """Output directory target: '.txtdir' replaced by '.out.txtdir'."""
        return self.outputfrominput(
            inputformat='txtdir',
            stripextension='.txtdir',
            addextension='.out.txtdir',
        )

    def run(self):
        """Build the input file list and yield one ParallelBatch per chunk."""
        target_dir = self.out_txtdir().path
        self.setup_output_dir(target_dir)

        # Gather input files; names are generated, the directory is not scanned.
        log.info("Collecting input files...")
        source_dir = self.in_txtdir().path
        inputfiles = []
        for number in range(1, self.n + 1):
            inputfiles.append(os.path.join(source_dir, '{}.txt'.format(number)))
        log.info("Collected {} input files".format(len(inputfiles)))

        # Dynamic dependencies: batches are yielded one at a time, each carrying
        # its files as a single comma-separated string.
        for batch in chunk(inputfiles, 1000):
            joined = ','.join(batch)
            yield ParallelBatch(
                component='Voweleater',
                inputfiles=joined,
                passparameters=PassParameters(outputdir=target_dir),
            )
Exemplo n.º 9
0
class IndexAllTweetsTask(Task):
    """Build a JSON index from tweet id to [relative file name, position in file]."""

    in_tweetdir = InputSlot()

    def out_indexed_tweets(self):
        """Index target: '.tweets' replaced by '.tweets_indexed.json'."""
        return self.outputfrominput(
            inputformat='tweetdir',
            stripextension='.tweets',
            addextension='.tweets_indexed.json',
        )

    def run(self):
        """Walk every subdirectory's '*.entity.json' files and index their tweets."""
        indexed_tweets = {}

        # Subdirectories are visited in sorted order; files within one are not sorted.
        for tweetsubdir in sorted(glob.glob(self.in_tweetdir().path + '/*')):
            print(tweetsubdir)
            for tweetfile in glob.glob(tweetsubdir + '/*.entity.json'):
                # keep only "<subdir>/<file>" as the file reference
                relname = '/'.join(tweetfile.split('/')[-2:])
                with open(tweetfile, 'r', encoding='utf-8') as file_in:
                    tweetdicts = json.load(file_in)
                # a later occurrence of the same id overwrites the earlier one
                indexed_tweets.update(
                    {td['id']: [relname, position]
                     for position, td in enumerate(tweetdicts)})

        # write to file
        with open(self.out_indexed_tweets().path, 'w', encoding='utf-8') as file_out:
            json.dump(indexed_tweets, file_out)
Exemplo n.º 10
0
class Folia2html(Task):
    """Run the 'folia2html' executable on a FoLiA document to produce an HTML file."""

    executable = 'folia2html'  # external executable (None if n/a)

    in_folia = InputSlot()  # will be linked to an out_* slot of another module in the workflow specification

    def out_html(self):
        """HTML target: '.folia.xml' replaced by '.html'."""
        return self.outputfrominput(
            inputformat='folia',
            stripextension='.folia.xml',
            addextension='.html',
        )

    def run(self):
        """Call the executable with the input path and ``o`` set to the output path."""
        source_path = self.in_folia().path
        html_path = self.out_html().path
        self.ex(source_path, o=html_path)
Exemplo n.º 11
0
class Rst2folia(Task):
    """Run the 'rst2folia' executable to convert an .rst file to FoLiA XML."""

    executable = 'rst2folia'  # external executable (None if n/a)

    in_rst = InputSlot()  # will be linked to an out_* slot of another module in the workflow specification

    def out_folia(self):
        """FoLiA target: '.rst' replaced by '.folia.xml'."""
        return self.outputfrominput(
            inputformat='rst',
            stripextension='.rst',
            addextension='.folia.xml',
        )

    def run(self):
        """Call the executable with input path, output path and a document ID."""
        source_path = self.in_rst().path
        target_path = self.out_folia().path
        # The first component of the input file name (up to the first period)
        # becomes the FoLiA document ID.
        docid, _, _ = os.path.basename(source_path).partition('.')
        self.ex(source_path, target_path, docid=docid)
Exemplo n.º 12
0
class ArchiveEventsTask(Task):
    """Move events older than 100 days into per-day archive files.

    Events whose ``datetime`` lies more than 100 days before now are grouped
    by calendar day and written to 'events_<YYYYMMDD>.json' files in the
    archive directory; all other events are written to the active-events file.
    """

    in_events = InputSlot()

    def out_archivedir(self):
        """Archive directory target: '.events' replaced by '.archive'."""
        return self.outputfrominput(inputformat='events',
                                    stripextension='.events',
                                    addextension='.archive')

    def out_active_events(self):
        """Active target: '.events' replaced by '.active.events'."""
        return self.outputfrominput(inputformat='events',
                                    stripextension='.events',
                                    addextension='.active.events')

    def run(self):
        """Partition events around the 100-day bound and write both partitions."""
        # Imported locally under an alias: the original expression mixed
        # ``datetime.now()`` (class API) with ``datetime.timedelta`` (module
        # API), which raises AttributeError whichever way the file binds the
        # name ``datetime``.
        import datetime as _dt

        # initiate directory
        self.setup_output_dir(self.out_archivedir().path)

        # read events
        # NOTE(review): naive local time; assumes event datetimes are naive too -- confirm.
        datebound = _dt.datetime.now() - _dt.timedelta(days=100)
        date_events = defaultdict(list)
        active_events = []
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            if eventobj.datetime < datebound:
                # key is the date part of str(datetime) with the dashes removed
                datekey = ''.join(
                    str(eventobj.datetime).split()[0].split('-'))
                date_events[datekey].append(eventobj)
            else:
                active_events.append(eventobj)

        # write archives, one file per day, in ascending key order
        print('Writing archives')
        for date in sorted(date_events):
            print(date)
            out_events = [eo.return_dict(txt=False) for eo in date_events[date]]
            outfile = self.out_archivedir().path + '/events_' + date + '.json'
            with open(outfile, 'w', encoding='utf-8') as file_out:
                json.dump(out_events, file_out)

        # write active events
        print('Writing active events')
        out_active_events = [
            eo.return_dict(txt=False) for eo in active_events
        ]
        with open(self.out_active_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_active_events, file_out)
Exemplo n.º 13
0
class ExtractEntitiesTask(Task):
    """Extract entities from tweets after removing known date and city strings."""

    in_cityref = InputSlot()

    # The first three are passed to Commonness.set_classencoder, the last to set_dmodel.
    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()

    def out_entity(self):
        """Output target: '.json' replaced by '.entity.json'."""
        return self.outputfrominput(
            inputformat='cityref',
            stripextension='.json',
            addextension='.entity.json',
        )

    def run(self):
        """Load tweets, attach extracted entities to each, and dump them as JSON."""
        # set commonness object (shared by every extractor below)
        commonness_model = commonness.Commonness()
        commonness_model.set_classencoder(
            self.commonness_txt, self.commonness_cls, self.commonness_corpus)
        commonness_model.set_dmodel(self.ngrams_score)

        # read in tweets
        with open(self.in_cityref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.load(file_in)

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract entities
        for tweetobj in tweets:
            # Remove the already extracted time and location strings from the
            # tweet text; the helper returns the remaining text as chunks.
            known_strings = [sr[0] for sr in tweetobj.string_refdates]
            known_strings = known_strings + tweetobj.cityrefs
            tweet_chunks = helpers.remove_pattern_from_string(
                tweetobj.text, known_strings)

            # One extractor per tweet; extraction and filtering run per chunk.
            extractor = entity_extractor.EntityExtractor()
            extractor.set_commonness(commonness_model)
            for textchunk in tweet_chunks:
                extractor.extract_entities(textchunk.split())
                extractor.filter_entities_threshold()
            tweetobj.set_entities(extractor.entities)

        # write to file
        outtweets = [tweetobj.return_dict() for tweetobj in tweets]
        with open(self.out_entity().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
Exemplo n.º 14
0
class Alpino2folia(Task):
    """Run 'alpino2folia' on all XML files of a directory to build one FoLiA document."""

    executable = 'alpino2folia'

    in_alpinodocdir = InputSlot()

    def out_folia(self):
        """FoLiA target: '.alpinodocdir' replaced by '.folia.xml'."""
        return self.outputfrominput(
            inputformat='alpinodocdir',
            stripextension='.alpinodocdir',
            addextension='.folia.xml',
        )

    def run(self):
        """Call the executable with the numerically ordered inputs, then the output path."""

        def numeric_prefix(path):
            # File names must start with an integer followed by a period;
            # int() raises ValueError otherwise.
            return int(os.path.basename(path).split('.')[0])

        # collect all alpino files in the collection, ordered by their number
        found = glob.glob(self.in_alpinodocdir().path + '/*.xml')
        alpinofiles = sorted(found, key=numeric_prefix)

        # last argument is the FoLiA output
        self.ex(*alpinofiles, self.out_folia().path)
Exemplo n.º 15
0
class IntegrateEventDirTask(Task):
    """Integrate all '.enhanced' event files of a directory into one event set.

    Task to speed up event integration for sliding window event extraction.
    Make sure that all events in the directory are deduplicated and enhanced
    before running this task; only files with extension '.enhanced' will be
    integrated.
    """

    in_eventdir = InputSlot()

    overlap_threshold = Parameter()  # converted with float() in run()

    def out_integrated_events(self):
        """Output target: '.events' stripped, 'events.integrated' appended."""
        # NOTE(review): addextension has no leading dot, unlike the
        # '.events.integrated' suffix other tasks strip -- confirm this is intended.
        return self.outputfrominput(inputformat='eventdir', stripextension='.events', addextension='events.integrated')

    def run(self):
        """Merge each file's events internally, then integrate them into the running set."""

        # collect all event files with extension '.enhanced'; sorted because
        # glob order is arbitrary and the integration below is order-dependent
        enhanced_events = sorted(glob.glob(self.in_eventdir().path + '/*.enhanced'))

        # initialize
        merger = event_merger.EventMerger()
        overlap_threshold = float(self.overlap_threshold)

        # for each event file
        for eventfile in enhanced_events:
            print('Reading',eventfile)
            with open(eventfile, 'r', encoding = 'utf-8') as file_in:
                current_eventdicts = json.loads(file_in.read())
            new_event_objs = []
            for ed in current_eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                new_event_objs.append(eventobj)
            # merge the file's own events first, using a fresh merger
            print('Merging new events before integration; number of events at start:',len(new_event_objs))
            premerger = event_merger.EventMerger()
            premerger.add_events(new_event_objs)
            premerger.find_merges(overlap_threshold)
            new_events_merged = premerger.return_events()
            print('Done. New events after merge:',len(new_events_merged))
            if len(merger.events) == 0:
                # nothing integrated yet: take the merged events as they are
                merger.add_events(new_events_merged)
            else:
                # integrate each event into the current ones
                print('Starting integrating new events; number of current events:',len(merger.events))
                for new_event in new_events_merged:
                    merger.find_merge(new_event,overlap_threshold)

        # write merged; the loop variable is deliberately not called ``event``
        # so it does not shadow the ``event`` module name
        integrated_events = merger.return_events()
        print('Done. Number of events after integration:',len(integrated_events))
        out_integrated_events = [eo.return_dict() for eo in integrated_events]
        with open(self.out_integrated_events().path,'w',encoding='utf-8') as file_out:
            json.dump(out_integrated_events,file_out)
Exemplo n.º 16
0
class Foliacat(Task):
    """Run 'foliacat' over all matching files of a directory to produce one FoLiA file."""

    executable = 'foliacat'

    extension = Parameter(default='folia.xml')  # extension (without leading dot) to glob for

    in_foliadir = InputSlot()

    def out_folia(self):
        """FoLiA target: '.foliadir' replaced by '.folia.xml'."""
        return self.outputfrominput(inputformat='foliadir',stripextension='.foliadir', addextension='.folia.xml')

    def run(self):
        """Call the executable with the naturally sorted inputs, output path and document ID."""
        import os  # local import so the block does not rely on a file-level one

        foliafiles = natsort.natsorted(glob.glob(self.in_foliadir().path + '/*.' + self.extension))
        outputfile = self.out_folia().path
        # The first component of the output *file name* acts as document ID.
        # Splitting the full path (as before) put directory components into the
        # ID and gave an empty ID for paths such as './doc.folia.xml'.
        docid = os.path.basename(outputfile).split('.')[0]
        self.ex(*foliafiles,
                o=outputfile,
                i=docid)
Exemplo n.º 17
0
class VoweleaterTask(Task):
    """Example task wrapping an external tool that works on stdin and stdout.

    Uses 'sed' to delete every ASCII vowel (both cases) from a text file.
    """

    executable = 'sed'
    in_txt = InputSlot()
    encoding = Parameter(default='utf-8')  # declared but not read by run()

    def out_txt(self):
        """Output target: '.txt' replaced by '.novowels.txt'."""
        return self.outputfrominput(
            inputformat='txt',
            stripextension='.txt',
            addextension='.novowels.txt',
        )

    def run(self):
        """Feed the input file to sed and redirect its output to the target."""
        source_path = self.in_txt().path
        target_path = self.out_txt().path
        self.ex(
            e='s/[aeiouAEIOU]//g',
            __stdin_from=source_path,
            __stdout_to=target_path,
        )
Exemplo n.º 18
0
class PrepareInstancesIdsTask(Task):
    """Write event ids and their concatenated tweet text to two parallel files.

    For every event with non-blank text, one line goes to 'events_meta.txt'
    (the event's 'mongo_id') and one to 'events_text.txt' (all tweets of the
    event, each as "<user> <text>", joined by spaces).
    """

    in_events = InputSlot()

    def out_instances(self):
        """Output directory target: '.events.integrated' replaced by '.events.instances'."""
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.events.instances')

    def run(self):
        """Load the events, normalise their tweet text and write ids and text."""

        # initiate directory with instances
        self.setup_output_dir(self.out_instances().path)

        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding = 'utf-8') as file_in:
            eventdicts = json.loads(file_in.read())

        # extract information
        print('Extracting text')
        ids = []
        txt = []
        total = len(eventdicts)
        for i,ed in enumerate(eventdicts):
            # progress line every 1000 events (modulo instead of a list scan)
            if i % 1000 == 0:
                print('Event',i,'of',total)
            tweetstxt = []
            for tweetlist in (ed['tweets'], ed['tweets_added']):
                for tweet in tweetlist:
                    tweettext = ' '.join([tweet['user'],tweet['text']])
                    if 'http' in tweettext:
                        # Replace every token starting with 'http' by a placeholder.
                        # Splitting and re-joining also collapses whitespace, which
                        # is why this only happens for texts that contain 'http'.
                        tokens = [
                            'THISISATWITTERLINK' if token.startswith('http') else token
                            for token in tweettext.split()
                        ]
                        tweettext = ' '.join(tokens)
                    # keep each instance on a single line
                    tweetstxt.append(tweettext.replace('\n',' ').replace('\r',' '))
            eventtext = ' '.join(tweetstxt)
            # events without any text are left out of both output files
            if eventtext.strip() == '':
                continue
            ids.append(ed['mongo_id'])
            txt.append(eventtext)

        # write data
        print('Done. Writing to files')
        with open(self.out_instances().path + '/events_meta.txt','w',encoding='utf-8') as out:
            out.write('\n'.join(ids))

        with open(self.out_instances().path + '/events_text.txt','w',encoding='utf-8') as out:
            out.write('\n'.join(txt))
Exemplo n.º 19
0
class LowercaseTask(Task):
    """A simple task implemented in Python: lowercase the content of a text file."""

    in_txt = InputSlot()
    encoding = Parameter(default='utf-8')  # used for both reading and writing

    def out_txt(self):
        """Output target: '.txt' replaced by '.lowercase.txt'."""
        return self.outputfrominput(
            inputformat='txt',
            stripextension='.txt',
            addextension='.lowercase.txt',
        )

    def run(self):
        """Read the whole input, lowercase it and write it to the output target."""
        source_path = self.in_txt().path
        with open(source_path, 'r', encoding=self.encoding) as f_in, \
                open(self.out_txt().path, 'w', encoding=self.encoding) as f_out:
            lowered = f_in.read().lower()
            f_out.write(lowered)
Exemplo n.º 20
0
class Frog_folia2folia(Task):
    """Run the 'frog' executable on a FoLiA document, producing a FoLiA document."""

    executable = 'frog'  # external executable (None if n/a)

    # Parameters for this module
    skip = Parameter(default="")  # an empty value is passed on as None

    in_folia = InputSlot()  # will be linked to an out_* slot of another module in the workflow specification

    def out_folia(self):
        """Output target: '.folia.xml' replaced by '.frogged.folia.xml'."""
        return self.outputfrominput(
            inputformat='folia',
            stripextension='.folia.xml',
            addextension='.frogged.folia.xml',
        )

    def run(self):
        """Call the executable with options ``x`` (input), ``X`` (output) and ``skip``."""
        source_path = self.in_folia().path
        target_path = self.out_folia().path
        skip_value = self.skip or None
        self.ex(x=source_path, X=target_path, skip=skip_value)
Exemplo n.º 21
0
class FoliaHOCR(Task):
    """Converts a directory of hOCR files to a directory of FoLiA files."""

    executable = "FoLiA-hocr"

    threads = Parameter(default=1)  # passed to the executable as option ``t``

    in_hocrdir = InputSlot()

    def out_foliadir(self):
        """Directory of FoLiA documents, one per hOCR file."""
        return self.outputfrominput(
            inputformat='hocrdir',
            stripextension='.hocrdir',
            addextension='.foliadir',
        )

    def run(self):
        """Prepare the output directory, then call the executable on the input directory."""
        target_dir = self.out_foliadir().path
        self.setup_output_dir(target_dir)
        source_dir = self.in_hocrdir().path
        self.ex(source_dir, t=self.threads, O=target_dir)
Exemplo n.º 22
0
class CollatePDF(Task):
    """Collate multiple PDF files together using 'pdftk'."""

    executable = 'pdftk'

    naturalsort = BoolParameter(default=True)  # do a natural sort of all pdfs in the input directory

    in_pdfdir = InputSlot()

    def out_pdf(self):
        """PDF target: '.pdfdir' replaced by '.pdf'."""
        return self.outputfrominput(
            inputformat='pdfdir',
            stripextension='.pdfdir',
            addextension='.pdf',
        )

    def run(self):
        """Call the executable as: <pdf files...> output <target pdf>."""
        # collect all pdf files in the collection; order is whatever glob
        # returns unless natural sorting is enabled
        pdf_files = glob.glob(self.in_pdfdir().path + '/*.pdf')
        if self.naturalsort:
            pdf_files = natsort.natsorted(pdf_files)
        self.ex(*pdf_files, 'output', self.out_pdf().path)
Exemplo n.º 23
0
class Pdf2images(Task):
    """Extract images from a PDF document to a set of TIFF images."""

    executable = 'pdfimages'  # external executable (None if n/a)

    in_pdf = InputSlot()  # will be linked to an out_* slot of another module in the workflow specification

    def out_tiffdir(self):
        """Output directory target: '.pdf' replaced by '.tiffdir'."""
        return self.outputfrominput(
            inputformat='pdf',
            stripextension='.pdf',
            addextension='.tiffdir',
        )

    def run(self):
        """Run the executable, writing into the directory managed by DirectoryHandler."""
        # Per the original note: the DirectoryHandler creates a temporary
        # directory to hold all output, renames it to the final directory when
        # everything succeeds, and cleans up otherwise.
        with DirectoryHandler(self.out_tiffdir().path) as dirhandler:
            pdf_path = self.in_pdf().path
            # output goes to the temporary directory, with the PDF's base name
            # (up to the first period) as file prefix
            stem = os.path.basename(pdf_path).split('.')[0]
            output_prefix = dirhandler.directory + '/' + stem
            self.ex(
                pdf_path,
                output_prefix,
                tiff=True,
                p=True,
                __singlehyphen=True,  # use single hyphens even for multi-letter options
            )
Exemplo n.º 24
0
class FilterEventsTask(Task):
    """Filter merged events with an EventFilter driven by a city list."""

    in_merged_events = InputSlot()

    citylist = Parameter()  # path of a text file, one entry per line

    def out_filtered_events(self):
        """Output target: '.merged' replaced by '.filtered'."""
        return self.outputfrominput(
            inputformat='merged_events',
            stripextension='.merged',
            addextension='.filtered',
        )

    def run(self):
        """Load events and city list, apply the filter and dump the survivors as JSON."""
        # read in events
        print('Reading in events')
        with open(self.in_merged_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.load(file_in)
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            event_objs.append(eventobj)

        # read in citylist
        print('Reading in citylist')
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            raw_cities = file_in.read().strip()
        citylist = [line.strip() for line in raw_cities.split('\n')]

        # initialize event filter (local name avoids shadowing the builtin ``filter``)
        print('Filtering; number of events at start:', len(event_objs))
        event_sieve = event_filter.EventFilter()
        event_sieve.add_events(event_objs)
        event_sieve.apply_filter(citylist)
        events_filtered = event_sieve.return_events()
        print('Done. number of events after filter:', len(events_filtered))

        # write filtered events
        out_filtered_events = [eo.return_dict(txt=False) for eo in events_filtered]
        with open(self.out_filtered_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_filtered_events, file_out)
Exemplo n.º 25
0
class ExtractCityrefTask(Task):
    """Find city references in tweets after removing known date strings."""

    in_dateref = InputSlot()

    citylist = Parameter()  # path of a text file, one entry per line

    def out_cityref(self):
        """Output target: '.json' replaced by '.cityref.json'."""
        return self.outputfrominput(
            inputformat='dateref',
            stripextension='.json',
            addextension='.cityref.json',
        )

    def run(self):
        """Load city list and tweets, attach city references and dump the tweets as JSON."""
        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            raw_cities = file_in.read().strip()
        citylist = [line.strip() for line in raw_cities.split('\n')]

        # read in tweets
        with open(self.in_dateref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.load(file_in)

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract location
        for tweetobj in tweets:
            # Remove the already extracted time strings from the tweet text;
            # the helper returns the remaining text as chunks.
            datestrings = [sr[0] for sr in tweetobj.string_refdates]
            tweet_chunks = helpers.remove_pattern_from_string(
                tweetobj.text, datestrings)
            # One extractor per tweet, fed chunk by chunk.
            extractor = cityref_extractor.CityrefExtractor(citylist)
            for textchunk in tweet_chunks:
                extractor.find_cityrefs(textchunk)
            tweetobj.set_cityrefs(extractor.return_cityrefs())

        # write to file
        outtweets = [tweetobj.return_dict() for tweetobj in tweets]
        with open(self.out_cityref().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
Exemplo n.º 26
0
class FoliaValidatorDirTask(Task):
    """Validate every FoLiA file under a directory and write a one-line-per-file summary.

    NOTE(review): ``self.outputdir`` is read but not declared in this class;
    presumably it is provided by ``Task`` -- confirm.
    """

    executable = "foliavalidator"
    in_foliadir = InputSlot()
    folia_extension = Parameter(default='folia.xml')  # extension (without leading dot) to search for

    def out_validationsummary(self):
        """Summary target: '.foliadir' replaced by '.folia-validation-summary.txt'."""
        return self.outputfrominput(
            inputformat='foliadir',
            stripextension='.foliadir',
            addextension='.folia-validation-summary.txt',
        )

    def run(self):
        """Schedule one validator per file, then summarise the resulting reports."""
        # make sure an explicitly requested output directory exists
        if self.outputdir and not os.path.exists(self.outputdir):
            os.makedirs(self.outputdir)

        # gather input files
        log.info("Collecting input files...")
        source_dir = self.in_foliadir().path
        inputfiles = recursive_glob(source_dir, '*.' + self.folia_extension)
        log.info("Collected {} input files".format(len(inputfiles)))

        log.info("Scheduling validators")
        # the output-directory options are only passed on when one is set
        forwarded = {'folia_extension': self.folia_extension}
        if self.outputdir:
            forwarded['replaceinputdir'] = self.in_foliadir().path
            forwarded['outputdir'] = self.outputdir
        passparameters = PassParameters(**forwarded)

        # dynamic dependencies: execution continues below once these are done
        yield [FoliaValidator(inputfile=inputfile, passparameters=passparameters)
               for inputfile in inputfiles]

        log.info("Collecting output files...")
        # gather all output files from wherever the validators wrote them
        report_root = self.outputdir or self.in_foliadir().path
        outputfiles = recursive_glob(report_root, '*.folia-validation-report.txt')

        log.info("Writing summary")
        with open(self.out_validationsummary().path, 'w', encoding='utf-8') as f_summary:
            for outputfilename in outputfiles:
                with open(outputfilename, 'r', encoding='utf-8') as f:
                    # a report counts as OK when any line starts with the success marker
                    success = any(
                        line.startswith('Validated successfully') for line in f)
                verdict = "OK" if success else "ERROR"
                f_summary.write(outputfilename + ": " + verdict + "\n")
Exemplo n.º 27
0
class CombineEntityTimeseriesTask(Task):
    """Combine the per-period entity count files of a directory into one set.

    Every ``20*counts.npz`` file in the input directory is loaded as a sparse
    CSR matrix, together with the dates from its ``20*counts_dates`` file. The
    matrices are stacked horizontally and written out, along with the
    concatenated dates and one vocabulary, as ``combined.*`` files.
    """

    in_entity_counts_dir = InputSlot()

    def out_combined_counts(self):
        """Return the output target for the combined count matrix (``.npz``)."""
        return self.outputfrominput(inputformat='entity_counts_dir', stripextension='.timeseries', addextension='.timeseries/combined.counts.npz')

    def out_combined_vocabulary(self):
        """Return the output target for the combined vocabulary file."""
        return self.outputfrominput(inputformat='entity_counts_dir', stripextension='.timeseries', addextension='.timeseries/combined.counts_vocabulary')

    def out_combined_dateseries(self):
        """Return the output target for the concatenated date sequence."""
        return self.outputfrominput(inputformat='entity_counts_dir', stripextension='.timeseries', addextension='.timeseries/combined.counts_dates')

    def run(self):
        """Load all period files, stack the counts and write the combined files.

        Raises:
            ValueError: if no count files are found, or if there are fewer
                date or vocabulary files than count files.
        """
        counts_dir = self.in_entity_counts_dir().path

        # read entity counts
        print('Reading countfiles')
        countfiles = sorted(glob.glob(counts_dir + '/20*' + 'counts.npz'))
        vocabularies = sorted(glob.glob(counts_dir + '/20*' + 'counts_vocabulary'))
        datefiles = sorted(glob.glob(counts_dir + '/20*' + 'counts_dates'))
        print(len(countfiles),'Countfiles and',len(vocabularies),'Vocabulary files and',len(datefiles),'datefiles')

        # Fail early with a clear message instead of an UnboundLocalError or
        # IndexError halfway through loading.
        if not countfiles:
            raise ValueError('No count files matching 20*counts.npz found in ' + counts_dir)
        if len(datefiles) < len(countfiles) or len(vocabularies) < len(countfiles):
            raise ValueError(
                'Expected a dates file and a vocabulary file for each of the '
                + str(len(countfiles)) + ' count files in ' + counts_dir
                + ', found ' + str(len(datefiles)) + ' dates files and '
                + str(len(vocabularies)) + ' vocabulary files'
            )

        dates = []
        counts = []
        # Files are paired by their position in the sorted listings
        for countfile, datefile in zip(countfiles, datefiles):
            print(countfile)
            with open(datefile,'r',encoding='utf-8') as file_in:
                dates.extend(file_in.read().strip().split('\n'))
            # The loaded archive holds an open file handle; close it once the arrays are read
            with numpy.load(countfile) as loader:
                counts.append(sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape = loader['shape']))

        # Only the vocabulary paired with the last count file is read
        # (presumably the vocabulary is identical across periods -- TODO confirm)
        with open(vocabularies[len(countfiles) - 1],'r',encoding='utf-8') as file_in:
            vocabulary = file_in.read().strip().split('\n')
        print('Done. Vocabulary size:',len(vocabulary),'Num dates:',len(dates),'Shape first counts:',counts[0].shape)

        # combine counts
        print('Combining counts')
        counts_combined = sparse.hstack(counts).tocsr()
        print('COMBINED SHAPE',counts_combined.shape)

        # write to files
        print('Writing to files')
        with open(self.out_combined_vocabulary().path,'w',encoding='utf-8') as out:
            out.write('\n'.join(vocabulary))
        with open(self.out_combined_dateseries().path,'w',encoding='utf-8') as out:
            out.write('\n'.join(dates))
        numpy.savez(self.out_combined_counts().path, data=counts_combined.data, indices=counts_combined.indices, indptr=counts_combined.indptr, shape=counts_combined.shape)
Exemplo n.º 28
0
class FoliaValidatorTask(Task):
    """Run the external validator on one input document and keep its stderr as the report."""

    executable = "foliavalidator"
    folia_extension = Parameter(default='folia.xml')

    in_folia = InputSlot()

    def out_validator(self):
        """Return the output target for the validation report of the input document."""
        return self.outputfrominput(
            inputformat='folia',
            stripextension=self.folia_extension,
            addextension='.folia-validation-report.txt',
        )

    def run(self):
        """Validate the input document, writing the validator's stderr to the report target."""
        report_path = self.out_validator().path

        # With an explicit outputdir, the report's directory (including any
        # intermediate directories) has to exist before the validator writes to it
        if self.outputdir:
            report_dir = os.path.dirname(report_path)
            self.setup_output_dir(report_dir)

        # A failing validator (which happens when the document is invalid) is
        # a valid result for us, so the failure is ignored
        self.ex(
            self.in_folia().path,
            __stderr_to=report_path,
            __ignorefailure=True,
        )
Exemplo n.º 29
0
class Folia2txt(Task):
    """Run the external ``folia2txt`` tool on the input document to produce a ``.txt`` file."""

    executable = 'folia2txt'  # external executable (None if n/a)

    sentenceperline = BoolParameter(default=False)
    paragraphperline = BoolParameter(default=False)
    retaintokenisation = BoolParameter(default=False)

    # will be linked to an out_* slot of another module in the workflow specification
    in_folia = InputSlot()

    def out_html(self):
        """Return the output target: the input name with ``.folia.xml`` replaced by ``.txt``."""
        return self.outputfrominput(
            inputformat='folia',
            stripextension='.folia.xml',
            addextension='.txt',
        )

    def run(self):
        """Invoke the executable with the input path, the output path and the three flags."""
        source_path = self.in_folia().path
        options = {
            'o': self.out_html().path,
            's': self.sentenceperline,
            'p': self.paragraphperline,
            't': self.retaintokenisation,
        }
        self.ex(source_path, **options)
Exemplo n.º 30
0
class Symlink(Task):
    """Create a symlink to the input file.

    The link is created at ``filename`` when that parameter is set; otherwise
    its location is derived from the input file using ``stripextension`` and
    ``addextension``.
    """

    filename = Parameter()
    stripextension = Parameter()
    addextension = Parameter()

    in_file = InputSlot()  #input slot

    def out_file(self):
        """Return the target of the symlink that run() creates."""
        if self.filename:
            return TargetInfo(self, self.filename)
        else:
            return self.outputfrominput(inputformat='file',
                                        stripextension=self.stripextension,
                                        addextension=self.addextension)

    def run(self):
        """Create the symlink at the output path, pointing to the input path."""
        # Both in_file and out_file must be called to obtain the target whose
        # .path is the filesystem path; the previous self.out_file.path()
        # looked up .path on the bound method and raised AttributeError.
        os.symlink(self.in_file().path, self.out_file().path)