Пример #1
0
class Tokenize(StandardWorkflowComponent):
    """Workflow component that sets up the ``Tokenize_instances`` task.

    Its input is either a file in the 'filtered' format or the output of the
    ``FilterTweets`` component.
    """

    # Component parameters; only format_json is forwarded upstream by
    # accepts().
    config = Parameter()
    strip_punctuation = BoolParameter()
    lowercase = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        """Return the accepted inputs as an (InputFormat, InputComponent) pair."""
        filtered_file = InputFormat(self,
                                    format_id='filtered',
                                    extension='.filtered.json')
        upstream = InputComponent(self,
                                  FilterTweets,
                                  format_json=self.format_json)
        return (filtered_file, upstream)

    def autosetup(self):
        """Return the task class this component sets up."""
        return Tokenize_instances
Пример #2
0
class ExtractCityref(StandardWorkflowComponent):
    """Workflow component that sets up ``ExtractCityrefTask``.

    Its input is either a '.dateref.json' file or the output of the
    ``ExtractDateref`` component, which receives the tokenisation and skip
    settings declared below.
    """

    citylist = Parameter()

    # Settings handed on to ExtractDateref in accepts(); format_json is
    # declared here but is not among the forwarded values.
    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        """Return the accepted inputs as an (InputFormat, InputComponent) pair."""
        dateref_file = InputFormat(self,
                                   format_id='dateref',
                                   extension='.dateref.json')
        # The skip_* parameters are renamed to the *match names that
        # ExtractDateref declares.
        upstream_settings = {
            'config': self.config,
            'strip_punctuation': self.strip_punctuation,
            'to_lowercase': self.to_lowercase,
            'skip_datematch': self.skip_date,
            'skip_monthmatch': self.skip_month,
            'skip_timeunitmatch': self.skip_timeunit,
            'skip_daymatch': self.skip_day,
        }
        upstream = InputComponent(self, ExtractDateref, **upstream_settings)
        return (dateref_file, upstream)

    def autosetup(self):
        """Return the task class this component sets up."""
        return ExtractCityrefTask
Пример #3
0
class ExtractEntities(StandardWorkflowComponent):
    """Workflow component that sets up ``ExtractEntitiesTask``.

    Its input is either a file in the 'cityref' format or the output of the
    ``ExtractCityref`` component.
    """

    # Parameters declared on this component and not forwarded by accepts().
    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()

    # Settings handed on to ExtractCityref in accepts(); format_json is
    # declared here but is not among the forwarded values.
    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    citylist = Parameter()
    format_json = BoolParameter()

    def accepts(self):
        """Return the accepted inputs as an (InputFormat, InputComponent) pair."""
        cityref_file = InputFormat(self, format_id='cityref', extension='.json')
        upstream_settings = {
            'config': self.config,
            'strip_punctuation': self.strip_punctuation,
            'to_lowercase': self.to_lowercase,
            'citylist': self.citylist,
            'skip_date': self.skip_date,
            'skip_month': self.skip_month,
            'skip_timeunit': self.skip_timeunit,
            'skip_day': self.skip_day,
        }
        upstream = InputComponent(self, ExtractCityref, **upstream_settings)
        return (cityref_file, upstream)

    def autosetup(self):
        """Return the task class this component sets up."""
        return ExtractEntitiesTask
Пример #4
0
class FilterTweets(StandardWorkflowComponent):
    """Workflow component that sets up ``FilterTweetsTask``.

    Its only accepted input is a file in the 'tweets' format with the
    extension '.gz'.
    """

    format_json = BoolParameter()

    def accepts(self):
        """Return the single accepted input format."""
        tweets_file = InputFormat(self, format_id='tweets', extension='.gz')
        return tweets_file

    def autosetup(self):
        """Return the task class this component sets up."""
        return FilterTweetsTask
Пример #5
0
class Folia2txt(Task):
    """Task that runs the external ``folia2txt`` executable on a FoLiA file."""

    executable = 'folia2txt'  # external executable (None if n/a)

    sentenceperline = BoolParameter(default=False)
    paragraphperline = BoolParameter(default=False)
    retaintokenisation = BoolParameter(default=False)

    # Will be linked to an out_* slot of another module in the workflow
    # specification.
    in_folia = InputSlot()

    def out_html(self):
        """Return the output target: the input name with '.folia.xml' replaced by '.txt'.

        Despite the slot name, the extension that is added is '.txt'.
        """
        return self.outputfrominput(inputformat='folia',
                                    stripextension='.folia.xml',
                                    addextension='.txt')

    def run(self):
        """Invoke the executable with the input path and the o/s/p/t options."""
        source_path = self.in_folia().path
        target_path = self.out_html().path
        self.ex(source_path,
                o=target_path,
                s=self.sentenceperline,
                p=self.paragraphperline,
                t=self.retaintokenisation)
Пример #6
0
class UpdateEventTypesTask(Task):
    """Attach predicted event types and per-label scores to a set of events.

    Reads three files from the prediction directory (``events_meta.txt``,
    ``events_text.predictions.txt`` and ``events_text.full_predictions.txt``),
    loads the events JSON, sets ``eventtype`` and ``eventtype_scores`` on the
    event matching each id listed in the meta file, and writes all events
    back out as JSON.
    """

    in_events = InputSlot()
    in_predictiondir = InputSlot()

    text = BoolParameter()

    def out_updated_events(self):
        """Return the output target: '.events.integrated' becomes '.types.events.integrated'."""
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.types.events.integrated')

    def _read_prediction_lines(self, filename):
        """Return the stripped content of a prediction-directory file as a list of lines.

        An empty (or whitespace-only) file yields an empty list rather than
        a list holding one empty string.
        """
        path = self.in_predictiondir().path + '/' + filename
        with open(path, 'r', encoding='utf-8') as file_in:
            content = file_in.read().strip()
        return content.split('\n') if content else []

    def run(self):
        """Merge the predictions into the events and write the result.

        Raises:
            KeyError: if an id in the meta file matches no loaded event.
            IndexError: if the prediction files hold fewer rows than the
                meta file.
        """
        # read prediction data
        meta = self._read_prediction_lines('events_meta.txt')
        predictions = self._read_prediction_lines('events_text.predictions.txt')
        lines = self._read_prediction_lines('events_text.full_predictions.txt')
        # The first line of the full predictions holds the tab-separated
        # labels; each later line holds the values in that label order.
        label_order = lines[0].split('\t') if lines else []
        full_predictions = [line.split('\t') for line in lines[1:]]

        print('Meta',len(meta))
        print('Predictions',len(predictions))
        print('Full predictions',len(full_predictions))

        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.load(file_in)
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed,txt=self.text)
            event_objs.append(eventobj)

        # index events
        id_event = {eo.mongo_id: eo for eo in event_objs}

        # for each prediction: row i of the meta file names the event that
        # row i of both prediction files belongs to
        for i,mid in enumerate(meta):
            eo = id_event[mid]
            eo.eventtype = predictions[i]
            eo.eventtype_scores = dict(zip(label_order,full_predictions[i]))

        # write output
        out_updated_events = [eo.return_dict(txt=self.text) for eo in event_objs]
        with open(self.out_updated_events().path,'w',encoding='utf-8') as file_out:
            json.dump(out_updated_events,file_out)
Пример #7
0
class Ucto(StandardWorkflowComponent):
    """A workflow component for Ucto"""

    # A parameter for the workflow, will be passed on to the tasks.
    skip = Parameter(default="")

    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    def accepts(self):
        """Return the tuple of initial inputs and other workflows accepted as input.

        The entries form a disjunction: only one of them will be selected.
        """
        folia_input = InputFormat(self, format_id='folia', extension='folia.xml')
        text_input = InputFormat(self, format_id='txt', extension='txt')
        tokenised_input = InputFormat(self, format_id='tok', extension='tok')
        converted_input = InputComponent(self, ConvertToFoLiA)
        return (folia_input, text_input, tokenised_input, converted_input)

    def autosetup(self):
        """Return the candidate task classes for this component."""
        candidate_tasks = (Ucto_txt2folia, Ucto_folia2folia, Ucto_tok2folia)
        return candidate_tasks
Пример #8
0
class Tokenize_instances(Task):
    """Tokenize the 'text' field of every entry in a '.filtered.json' file.

    Each text is run through a ucto tokenizer built from ``config``; the
    tokens are joined with single spaces and stored back in the entry.
    """

    in_filtered = InputSlot()

    config = Parameter()
    strip_punctuation = BoolParameter()
    lowercase = BoolParameter()

    def out_tokenized(self):
        """Return the output target: '.filtered.json' becomes '.tok.json'."""
        return self.outputfrominput(inputformat='filtered',
                                    stripextension='.filtered.json',
                                    addextension='.tok.json')

    def run(self):
        """Read the input JSON, tokenize each entry's text and write the result."""
        print('Running Tokenizer...')

        source_path = self.in_filtered().path
        with open(source_path, 'r', encoding='utf-8') as file_in:
            tweets = json.load(file_in)

        tokenizer = ucto.Tokenizer(self.config)
        toktweets = []
        for tweet in tweets:
            tokenizer.process(tweet['text'])
            # Keep every token unless punctuation stripping is on and the
            # token is of type PUNCTUATION.
            kept = [
                token.text for token in tokenizer
                if not self.strip_punctuation
                or token.tokentype != 'PUNCTUATION'
            ]
            joined = ' '.join(kept)
            # The entry is updated in place and then collected.
            tweet['text'] = joined.lower() if self.lowercase else joined
            toktweets.append(tweet)

        # write to file
        target_path = self.out_tokenized().path
        with open(target_path, 'w', encoding='utf-8') as file_out:
            json.dump(toktweets, file_out)
Пример #9
0
class ExtractDaterefTask(Task):
    """Extract date references from tokenized tweets.

    Loads tweets from a '.tok.json' file, runs a ``Dutch_timex_extractor``
    over each tweet's text and datetime, stores the resulting refdates on the
    tweet and writes all tweets to a '.dateref.json' file.
    """

    in_tokenized = InputSlot()

    # Flags passed, in this order, to the extractor's extract_refdates().
    skip_datematch = BoolParameter()
    skip_monthmatch = BoolParameter()
    skip_timeunitmatch = BoolParameter()
    skip_daymatch = BoolParameter()

    def out_dateref(self):
        """Return the output target: '.tok.json' becomes '.dateref.json'."""
        return self.outputfrominput(inputformat='tokenized',
                                    stripextension='.tok.json',
                                    addextension='.dateref.json')

    def run(self):
        """Read the tweets, attach refdates to each one and write them out."""
        # read in tweets
        with open(self.in_tokenized().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.load(file_in)

        # format as tweet objects
        tweets = []
        for tweetdict in tweetdicts:
            loaded = tweet.Tweet()
            loaded.import_tweetdict(tweetdict)
            tweets.append(loaded)

        # extract daterefs
        skip_flags = (self.skip_datematch, self.skip_monthmatch,
                      self.skip_timeunitmatch, self.skip_daymatch)
        for tweetobj in tweets:
            extractor = dutch_timex_extractor.Dutch_timex_extractor(
                tweetobj.text, tweetobj.datetime)
            extractor.extract_refdates(*skip_flags)
            extractor.filter_future_refdates()
            tweetobj.set_refdates(extractor.refdates)

        # write to file
        outtweets = []
        for tweetobj in tweets:
            outtweets.append(tweetobj.return_dict())
        with open(self.out_dateref().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
Пример #10
0
class Ucto_txt2tok(Task):
    """Task that runs the external ``ucto`` executable on a '.txt' file to produce a '.tok' file."""

    executable = 'ucto'  # external executable (None if n/a)

    # Parameters for this module (all mandatory!)
    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    # Will be linked to an out_* slot of another module in the workflow
    # specification.
    in_txt = InputSlot()

    def out_tok(self):
        """Return the output target: '.txt' becomes '.tok'."""
        return self.outputfrominput(inputformat='txt',
                                    stripextension='.txt',
                                    addextension='.tok')

    def run(self):
        """Invoke the executable with the input path, output path and the L/m/n options."""
        source_path = self.in_txt().path
        target_path = self.out_tok().path
        self.ex(source_path,
                target_path,
                L=self.language,
                m=self.tok_input_sentenceperline,
                n=self.tok_output_sentenceperline)
Пример #11
0
class Ucto_dir(StandardWorkflowComponent):
    """A workflow component for Ucto that operates on entire directories"""

    # A parameter for the workflow, will be passed on to the tasks.
    skip = Parameter(default="")

    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    def accepts(self):
        """Return the tuple of initial inputs and other workflows accepted as input.

        The entries form a disjunction: only one of them will be selected.
        Both entries are declared with ``directory=True``.
        """
        text_directory = InputFormat(self,
                                     format_id='txtdir',
                                     extension='txtdir',
                                     directory=True)
        folia_directory = InputFormat(self,
                                      format_id='foliadir',
                                      extension='foliadir',
                                      directory=True)
        return (text_directory, folia_directory)

    def autosetup(self):
        """Return the candidate task classes for this component."""
        candidate_tasks = (Ucto_txt2folia_dir, Ucto_folia2folia_dir)
        return candidate_tasks
Пример #12
0
class CollatePDF(Task):
    """Collate multiple PDF files together"""

    executable = 'pdftk'

    # Do a natural sort of all pdfs in the input directory.
    naturalsort = BoolParameter(default=True)

    in_pdfdir = InputSlot()

    def out_pdf(self):
        """Return the output target: '.pdfdir' becomes '.pdf'."""
        return self.outputfrominput(inputformat='pdfdir',
                                    stripextension='.pdfdir',
                                    addextension='.pdf')

    def run(self):
        """Invoke the executable with every input PDF, then 'output' and the target path."""
        # Collect all pdf files in the input directory.
        pattern = self.in_pdfdir().path + '/*.pdf'
        pdf_files = glob.glob(pattern)
        if self.naturalsort:
            pdf_files = natsort.natsorted(pdf_files)
        self.ex(*[*pdf_files, 'output', self.out_pdf().path])
Пример #13
0
class UpdateEventTypes(WorkflowComponent):
    """Workflow component that wires up a single ``UpdateEventTypesTask``."""

    # Named as the ``inputparameter`` of the two formats in accepts().
    events = Parameter()
    predictiondir = Parameter()

    text = BoolParameter()

    def accepts(self):
        """Return a list holding one (predictiondir, events) pair of input formats.

        NOTE(review): presumably a tuple inside the list means both inputs
        are required together — confirm against the LuigiNLP documentation.
        """
        prediction_input = InputFormat(self,
                                       format_id='predictiondir',
                                       extension='.instances',
                                       inputparameter='predictiondir')
        events_input = InputFormat(self,
                                   format_id='events',
                                   extension='.events.integrated',
                                   inputparameter='events')
        return [(prediction_input, events_input)]

    def setup(self, workflow, input_feeds):
        """Create the updater task, connect both input feeds and return the task."""
        updater = workflow.new_task('update_event_types',
                                    UpdateEventTypesTask,
                                    autopass=True,
                                    text=self.text)
        updater.in_events = input_feeds['events']
        updater.in_predictiondir = input_feeds['predictiondir']
        return updater
Пример #14
0
class Timbl_leaveoneout(Timbl_base):
    """Task that runs the inherited executable with ``t="leave_one_out"`` on a '.train' file.

    The algorithm, k, metric, weighting and distance settings are read from
    attributes that this class does not declare itself (presumably provided
    by ``Timbl_base`` — confirm there).
    """

    in_train = InputSlot()

    # Declared here but not read by run().
    leaveoneout = BoolParameter(default=False)

    def out_log(self):
        """Return the output target: '.train' becomes '.timbl.leaveoneout.log'."""
        return self.outputfrominput(inputformat='train',
                                    stripextension='.train',
                                    addextension='.timbl.leaveoneout.log')

    def run(self):
        """Invoke the executable on the training file, passing the log path as ``__stdout_to``."""
        train_path = self.in_train().path
        log_path = self.out_log().path
        self.ex(f=train_path,
                t="leave_one_out",
                a=self.algorithm,
                k=self.k,
                m=self.metric,
                w=self.weighting,
                d=self.distance,
                __stdout_to=log_path)
Пример #15
0
class FilterTweetsTask(Task):
    """Filter a gzipped file of tweets and write the survivors as JSON.

    Every line of the input is one tweet. The tweets are passed through a
    ``Tweetfilter`` (``discard_retweets`` followed by ``discard_nondutch``),
    converted with ``Tweet.import_twiqsdict`` and dumped as a list of dicts
    to a '.filtered.json' file.
    """

    in_tweets = InputSlot() #input slot for a gzipped tweet file

    # When set, every line is parsed as JSON; otherwise the stripped line
    # itself is used.
    format_json = BoolParameter()

    def out_filtered(self):
        """Return the output target: '.gz' becomes '.filtered.json'."""
        return self.outputfrominput(inputformat='tweets', stripextension='.gz', addextension='.filtered.json')

    def run(self):
        """Read, filter and write the tweets, printing the counts along the way."""
        # read in gzipped tweet file; the context manager closes the handle
        # even when reading fails, and undecodable bytes are ignored
        tweets = []
        with gzip.open(self.in_tweets().path, 'rt', encoding='utf-8', errors='ignore') as infile:
            for line in infile:
                line = line.strip()
                if not self.format_json:
                    tweets.append(line)
                    continue
                try:
                    tweets.append(json.loads(line))
                except ValueError:
                    # json.JSONDecodeError is a ValueError; a malformed line
                    # is reported and skipped, not fatal
                    print('Error loading json, skipping to next line')
        print(self.in_tweets().path,'contains',len(tweets),'before filtering')
        tf = tweetfilter.Tweetfilter(tweets)
        tf.discard_retweets()
        print('after retweet filter',len(tf.tweets))
        tf.discard_nondutch()
        filtered_tweets = tf.return_tweets()
        print('after filtering:',len(filtered_tweets))
        # convert filtered tweets to dicts
        outtweets = []
        for filtered_tweet in filtered_tweets:
            tweetobj = tweet.Tweet()
            tweetobj.import_twiqsdict(filtered_tweet)
            outtweets.append(tweetobj.return_dict())
        # write to file
        with open(self.out_filtered().path,'w',encoding='utf-8') as outfile:
            json.dump(outtweets,outfile)
Пример #16
0
class ExtractDateref(StandardWorkflowComponent):
    """Workflow component that sets up ``ExtractDaterefTask``.

    Its input is either a file in the 'tokenized' format or the output of
    the ``Tokenize`` component.
    """

    # Declared here and not forwarded by accepts().
    skip_datematch = BoolParameter()
    skip_monthmatch = BoolParameter()
    skip_timeunitmatch = BoolParameter()
    skip_daymatch = BoolParameter()

    # Settings handed on to Tokenize in accepts(); format_json is declared
    # here but is not among the forwarded values.
    config = Parameter(default=False)
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        """Return the accepted inputs as an (InputFormat, InputComponent) pair."""
        tokenized_file = InputFormat(self,
                                     format_id='tokenized',
                                     extension='tok.json')
        # to_lowercase is passed to Tokenize under its own name, lowercase.
        upstream = InputComponent(self,
                                  Tokenize,
                                  config=self.config,
                                  strip_punctuation=self.strip_punctuation,
                                  lowercase=self.to_lowercase)
        return (tokenized_file, upstream)

    def autosetup(self):
        """Return the task class this component sets up."""
        return ExtractDaterefTask
Пример #17
0
class Frog_txt2folia(Task):
    """A task for Frog: Takes plaintext input and produces FoLiA output"""

    executable = 'frog'  # external executable (None if n/a)

    # Parameters for this module (all mandatory!)
    tok_input_sentenceperline = BoolParameter(default=False)
    skip = Parameter(default="")

    # Input slot placeholder (will be linked to an out_* slot of another
    # module in the workflow specification).
    in_txt = InputSlot()

    def out_folia(self):
        """The output slot, for FoLiA: '.txt' becomes '.frogged.folia.xml'."""
        # The format_id corresponds to the input slot (txt -> in_txt).
        return self.outputfrominput(inputformat='txt',
                                    stripextension='.txt',
                                    addextension='.frogged.folia.xml')

    def run(self):
        """Run the executable on the input file.

        Python keyword arguments are passed as option flags (- for one
        letter, -- for more), values are made shell-safe, and None or False
        values are not propagated at all.
        """
        source_path = self.in_txt().path  # path of the input file
        # First component of the input filename (up to the first period)
        # will be the FoLiA ID.
        document_id = os.path.basename(source_path).partition('.')[0]
        self.ex(
            t=source_path,
            X=self.out_folia().path,  # path of the output file
            id=document_id,
            skip=self.skip or None,  # an empty skip value is not passed on
            n=self.tok_input_sentenceperline)