示例#1
0
    def init_y_annotations(self):
        """
        initialises the joint y vector with data from manually annotated abstracts

        For every study yielded by a fresh LabeledAbstractReader, the abstract
        is passed through swap_num and tag_words; the first word whose tag
        entry contains 'n' is parsed as an integer and stored in
        self.data["y_lookup_init"] under the study's Biview_id. Studies with
        no such word are skipped. The number of stored studies is written to
        self.seed_abstracts and logged.

        Raises TypeError if the first tagged word does not start with a
        positive integer (optionally preceded by 'n'/'N' and/or '=').
        """
        logging.info("Identifying seed data from annotated data")
        # NOTE(review): this progress bar is created but never advanced in
        # this method; it is kept in case its construction has side effects.
        p = progressbar.ProgressBar(len(self.biviewer), timer=True)
        annotation_viewer = LabeledAbstractReader()

        counter = 0
        for study in annotation_viewer:
            study_id = int(study["Biview_id"])
            text = swap_num(annotation_viewer.get_biview_id(study_id)['abstract'])

            parsed_tags = tag_words(text, flatten=True)
            tagged_number = [w[0] for w in parsed_tags if 'n' in w[1]]
            if not tagged_number:
                # nothing annotated as a number in this abstract
                continue

            number = re.match("[Nn]?=?([1-9]+[0-9]*)", tagged_number[0])
            if number is None:
                # Interpolate the offending word into the message; passing it
                # as a second argument would leave the '%s' unformatted.
                raise TypeError('Unable to convert tagged number %s to integer'
                                % (tagged_number[0],))

            self.data["y_lookup_init"][study_id] = int(number.group(1))
            counter += 1

        self.seed_abstracts = counter
        logging.info("%d seed abstracts found", counter)
示例#2
0
 def __init__(self, text):
     """
     Store the raw text and build one feature dict per word.

     text = the document to process; it is passed through swap_num, split
     with self.sent_tokenize, and each sentence is split with
     self.word_tokenize. self.functions becomes a list (one entry per
     sentence) of lists of {"w": word} dicts. Templates are loaded last.
     """
     self.text = text

     per_sentence = []
     for sentence in self.sent_tokenize(swap_num(text)):
         tokens = self.word_tokenize(sentence)
         per_sentence.append([{"w": token} for token in tokens])
     self.functions = per_sentence

     self.load_templates()
示例#3
0
    def init_y_annotations(self):
        """
        initialises the joint y vector with data from manually annotated abstracts

        Each study from a new LabeledAbstractReader has its abstract run
        through swap_num and tag_words. The first word whose tag entry
        contains 'n' is matched against an integer pattern and the value is
        saved in self.data["y_lookup_init"], keyed by the study's Biview_id.
        Abstracts without a tagged number are ignored. The count of saved
        studies ends up in self.seed_abstracts.

        Raises TypeError when the first tagged word does not begin with a
        positive integer (an optional 'n'/'N' and '=' prefix is allowed).
        """
        logging.info("Identifying seed data from annotated data")
        # NOTE(review): the progress bar is never tapped below; construction
        # is retained in case it has side effects.
        p = progressbar.ProgressBar(len(self.biviewer), timer=True)
        annotation_viewer = LabeledAbstractReader()

        counter = 0
        for study in annotation_viewer:
            study_id = int(study["Biview_id"])
            text = swap_num(
                annotation_viewer.get_biview_id(study_id)['abstract'])

            parsed_tags = tag_words(text, flatten=True)
            tagged_number = [w[0] for w in parsed_tags if 'n' in w[1]]
            if not tagged_number:
                continue

            first_tagged = tagged_number[0]
            number = re.match("[Nn]?=?([1-9]+[0-9]*)", first_tagged)
            if number is None:
                # The word must be formatted into the message here: extra
                # positional arguments to an exception are not interpolated.
                raise TypeError(
                    'Unable to convert tagged number %s to integer'
                    % (first_tagged,))

            self.data["y_lookup_init"][study_id] = int(number.group(1))
            counter += 1

        self.seed_abstracts = counter
        logging.info("%d seed abstracts found", counter)
示例#4
0
 def __init__(self, text):
     """
     Build word/part-of-speech feature dicts for every sentence of text.

     text = the document to process. It is passed through swap_num and
     split with self.sent_tokenize; each sentence is word-tokenized and
     tagged with pos_tagger.tag. self.functions is a list (per sentence)
     of lists of {"w": word, "p": pos} dicts. Templates are then loaded
     and the unmodified text is kept in self.text.
     """
     sentence_features = []
     for sentence in self.sent_tokenize(swap_num(text)):
         tagged_tokens = pos_tagger.tag(self.word_tokenize(sentence))
         sentence_features.append(
             [{"w": token, "p": tag} for token, tag in tagged_tokens])
     self.functions = sentence_features

     self.load_templates()
     self.text = text
 def __init__(self, text, window_size):
     """
     Build word/part-of-speech feature dicts for every sentence of text.

     text = the document to process
     window_size = stored unchanged as self.w_pos_window

     self.functions becomes a list (one entry per sentence) of lists of
     {"w": word, "p": pos} dicts, produced from swap_num(text) via
     self.sent_tokenize, self.word_tokenize and pos_tagger.tag.
     """
     # Raw string: '\,' in a normal literal is an invalid escape sequence.
     # NOTE(review): this cleaned text is not what gets tokenized below and
     # is replaced by the raw text at the end of __init__ -- confirm whether
     # the cleaned text was meant to be used.
     self.text = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', text)
     self.functions = [[{
         "w": word,
         "p": pos
     } for word, pos in pos_tagger.tag(self.word_tokenize(sent))]
                       for sent in self.sent_tokenize(swap_num(text))]
     self.load_templates()
     self.w_pos_window = window_size
     self.text = text
示例#6
0
    def seed_y_regex(self, annotation_viewer):
        """
        generates the test-set answers from annotated abstracts, then seeds
        the joint y vector using a simple regular expression rule

        annotation_viewer = reader of annotated abstracts; it must support
        len(), indexing (each item has a "biview_id" key) and get(index),
        which returns nested lists of (word, tags) entries

        Pass 1: for each index of annotation_viewer, the first word whose tag
        entry contains 'n' is converted with int() and stored in self.answers
        under the study's biview_id; -2 is stored when there is no tagged
        number. The index -> biview_id mapping is kept in
        self.annotation_viewer_to_biviewer.

        Pass 2: every PubMed abstract in self.biviewer is searched with the
        seed rule; when exactly one match is found its number is written to
        self.data["y_lookup_init"]. The count goes to self.seed_abstracts.
        """

        self.initialise()

        self.annotation_viewer_to_biviewer = {}
        self.answers = {}

        self.annotation_viewer = annotation_viewer

        logging.info("Generating answers for test set")

        p = progressbar.ProgressBar(len(self.annotation_viewer), timer=True)

        for study in range(len(self.annotation_viewer)):
            p.tap()

            biview_id = annotation_viewer[study]["biview_id"]
            self.annotation_viewer_to_biviewer[study] = biview_id

            # set answers
            parsed_tags = [item for sublist in annotation_viewer.get(study) for item in sublist] # flatten list
            tagged_numbers = [w[0] for w in parsed_tags if 'n' in w[1]] # then get any tagged numbers

            if tagged_numbers:
                number = int(tagged_numbers[0])
            else:
                # sentinel stored when the abstract has no tagged number
                number = -2

            self.answers[biview_id] = number

        logging.info("Generating seed data from regular expression")

        # use simple rule to identify population sizes (low sens/recall, high spec/precision)
        # Compiled once outside the loop; raw string so that '\w' is not an
        # invalid string escape sequence.
        seed_rule = re.compile(r'([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients|children|people) were (?:randomi[sz]ed)')
        # Alternative rules that were tried and left disabled:
        #   '(?:[Ww]e randomi[sz]ed )([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients)'
        #   '(?:[Aa] total of )([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients)'

        p = progressbar.ProgressBar(len(self.biviewer), timer=True)
        counter = 0  # number of studies initially found
        for study_id, (_, pubmed_dict) in enumerate(self.biviewer):
            p.tap()
            pubmed_text = swap_num(pubmed_dict.get("abstract", ""))
            matches = seed_rule.findall(pubmed_text)
            # only accept abstracts with exactly one match
            if len(matches) == 1:
                self.data["y_lookup_init"][study_id] = int(matches[0])
                counter += 1

        self.seed_abstracts = counter
        logging.info("%d seed abstracts found", counter)
示例#7
0
    def get_annotations(self, abstract, convert_numbers=True):
        '''
        Returns the result of tag_words for the given abstract.

        if convert_numbers is True, numerical strings (e.g., "twenty-five")
        will be converted to number ("25"), and afterwards any run of digits
        joined by a comma (e.g., "1,000") is removed from the text.
        '''
        if convert_numbers:
            abstract = swap_num(abstract)
            # Raw string: '\,' in a normal literal is an invalid escape
            # sequence; the regex itself is unchanged.
            abstract = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', abstract)

        return tag_words(abstract)
    def __init__(self, text, window_size):
        """
        Prepare tagged sentences and their feature dicts.

        text = either a str, which is cleaned, passed through swap_num and
               tagged with tag_words, or a list, which is used directly as
               the tagged sentences (self.text is not set in that case)
        window_size = stored unchanged as self.w_pos_window

        Raises TypeError if text is neither a str nor a list.
        """
        if isinstance(text, str):
            # Remove digit runs joined by a comma, then feed the cleaned
            # text on to swap_num. Previously swap_num was given the raw
            # text, which silently discarded the cleaning step.
            self.text = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', text)
            self.text = swap_num(self.text)
            self.tag_tuple_sents = tag_words(self.text)
        elif isinstance(text, list):
            self.tag_tuple_sents = text
        else:
            # Fail here with a clear message instead of an AttributeError on
            # self.tag_tuple_sents a few lines further down.
            raise TypeError("text must be a str or a list, got %s"
                            % type(text).__name__)

        self.functions = self.set_functions(self.tag_tuple_sents)

        self.w_pos_window = window_size
        self.load_templates()
示例#9
0
    def __init__(self, text, window_size):
        """
        Set up the tagged sentences and the feature dicts derived from them.

        text = a str (cleaned, passed through swap_num, then tagged with
               tag_words) or a list that already holds the tagged sentences;
               for a list, self.text is left unset
        window_size = kept as self.w_pos_window

        Raises TypeError if text is neither a str nor a list.
        """
        if isinstance(text, str):
            # Chain the two steps: strip comma-joined digit runs first, then
            # run swap_num on the result. The original passed the raw text
            # to swap_num, so the substitution had no effect.
            cleaned = re.sub(r"(?:[0-9]+)\,(?:[0-9]+)", "", text)
            self.text = swap_num(cleaned)
            self.tag_tuple_sents = tag_words(self.text)
        elif isinstance(text, list):
            self.tag_tuple_sents = text
        else:
            # Without this, the failure would surface below as a missing
            # self.tag_tuple_sents attribute.
            raise TypeError("text must be a str or a list, got %s"
                            % type(text).__name__)

        self.functions = self.set_functions(self.tag_tuple_sents)

        self.w_pos_window = window_size
        self.load_templates()
    def __init__(self, text_dict, window_size):
        """
        Build word/part-of-speech feature dicts for several named texts.

        text_dict = mapping of a part name to its text; each text is passed
                    through swap_num, split with self.sent_tokenize, and
                    each sentence is word-tokenized and tagged with
                    pos_tagger.tag
        window_size = stored unchanged as self.w_pos_window

        self.functions is a flat list of sentences across all parts; each
        sentence is a list of {"w": word, "p": pos, "cochrane_part": key}
        dicts, where key is the part name the sentence came from.
        """
        self.functions = []
        # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
        for key, value in text_dict.items():
            self.functions.extend(
                [[{
                    "w": word,
                    "p": pos,
                    "cochrane_part": key
                } for word, pos in pos_tagger.tag(self.word_tokenize(sent))]
                 for sent in self.sent_tokenize(swap_num(value))])

        self.load_templates()
        self.w_pos_window = window_size
示例#11
0
def get_annotations(abstract_nr, annotator, convert_numbers=False):
    '''
    Returns the result of tag_words for one abstract of an annotator.

    abstract_nr = index into the abstracts returned by get_abstracts(annotator)
    annotator = passed to get_abstracts to select the abstracts

    if convert_numbers is True, numerical strings (e.g., "twenty-five")
    will be converted to number ("25") before tagging.
    '''
    source_text = get_abstracts(annotator)[abstract_nr]
    to_tag = swap_num(source_text) if convert_numbers else source_text
    return tag_words(to_tag)
示例#12
0
    def init_y_regex(self):
        """
        initialises the joint y vector with data from a simple seed regex rule

        Every PubMed abstract in self.biviewer is passed through swap_num and
        searched for "<number> ... participants|men|women|patients were
        randomi[sz]ed". When exactly one match is found, the number is stored
        in self.data["y_lookup_init"] under the study's position in
        self.biviewer. The count of seeded studies is saved in
        self.seed_abstracts and logged.
        """
        logging.info("Identifying seed data from regular expression")

        # use simple rule to identify population sizes (low sens/recall, high spec/precision)
        # Compiled once outside the loop; raw string so that '\w' is not an
        # invalid string escape sequence.
        seed_rule = re.compile(r'([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients) were (?:randomi[sz]ed)')
        # Alternative rules that were tried and left disabled:
        #   '(?:[Ww]e randomi[sz]ed )([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients)'
        #   '(?:[Aa] total of )([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients)'

        p = progressbar.ProgressBar(len(self.biviewer), timer=True)
        counter = 0  # number of studies initially found
        for study_id, (_, pubmed_dict) in enumerate(self.biviewer):
            p.tap()
            pubmed_text = swap_num(pubmed_dict.get("abstract", ""))
            matches = seed_rule.findall(pubmed_text)
            # only accept abstracts with exactly one match
            if len(matches) == 1:
                self.data["y_lookup_init"][study_id] = int(matches[0])
                counter += 1

        self.seed_abstracts = counter
        logging.info("%d seed abstracts found", counter)
示例#13
0
    def init_y_regex(self):
        """
        initialises the joint y vector with data from a simple seed regex rule

        Walks self.biviewer, runs each PubMed abstract through swap_num and
        applies the seed rule "<number> ... participants|men|women|patients
        were randomi[sz]ed". An abstract contributes only when the rule
        matches exactly once; its number is then stored in
        self.data["y_lookup_init"] keyed by the study's position in
        self.biviewer. self.seed_abstracts receives the number of
        contributing abstracts.
        """
        logging.info("Identifying seed data from regular expression")

        # use simple rule to identify population sizes (low sens/recall, high spec/precision)
        # Raw string avoids the invalid '\w' string escape; compiling here
        # keeps the pattern lookup out of the loop.
        seed_rule = re.compile(
            r'([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients) were (?:randomi[sz]ed)')
        # Alternative rules that were tried and left disabled:
        #   '(?:[Ww]e randomi[sz]ed )([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients)'
        #   '(?:[Aa] total of )([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients)'

        p = progressbar.ProgressBar(len(self.biviewer), timer=True)
        counter = 0  # number of studies initially found
        for study_id, (_, pubmed_dict) in enumerate(self.biviewer):
            p.tap()
            pubmed_text = swap_num(pubmed_dict.get("abstract", ""))
            matches = seed_rule.findall(pubmed_text)
            # ambiguous (several matches) and empty results are both skipped
            if len(matches) == 1:
                self.data["y_lookup_init"][study_id] = int(matches[0])
                counter += 1

        self.seed_abstracts = counter
        logging.info("%d seed abstracts found", counter)
 def __init__(self, text, window_size):
     """
     Build word/part-of-speech feature dicts for every sentence of text.

     text = the document to process
     window_size = stored unchanged as self.w_pos_window

     self.functions is a list (one entry per sentence) of lists of
     {"w": word, "p": pos} dicts, derived from swap_num(text) through
     self.sent_tokenize, self.word_tokenize and pos_tagger.tag.
     """
     # Raw string: '\,' in a normal literal is an invalid escape sequence.
     # NOTE(review): the cleaned text assigned here is not tokenized below
     # and is overwritten with the raw text on the last line -- confirm
     # whether the cleaned text was meant to be used.
     self.text = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', text)
     self.functions = [[{"w": word, "p": pos} for word, pos in pos_tagger.tag(self.word_tokenize(sent))] for sent in self.sent_tokenize(swap_num(text))]
     self.load_templates()
     self.w_pos_window = window_size
     self.text = text
    def __init__(self, text_dict, window_size):
        """
        Build word/part-of-speech feature dicts for several named texts.

        text_dict = mapping of a part name to its text
        window_size = stored unchanged as self.w_pos_window

        Each text goes through swap_num and self.sent_tokenize; every
        sentence is word-tokenized and tagged with pos_tagger.tag.
        self.functions is a flat list of sentences over all parts, each a
        list of {"w": word, "p": pos, "cochrane_part": key} dicts.
        """
        self.functions = []
        # .items() is available on Python 2 and 3; .iteritems() only on 2.
        for key, value in text_dict.items():
            self.functions.extend([[{"w": word, "p": pos, "cochrane_part":key} for word, pos in pos_tagger.tag(self.word_tokenize(sent))] for sent in self.sent_tokenize(swap_num(value))])

        self.load_templates()
        self.w_pos_window = window_size
示例#16
0
 def __init__(self, text):
     """
     Create {"w": word, "p": pos} feature dicts for each sentence of text.

     text = the document to process; swap_num is applied before sentence
     splitting (self.sent_tokenize), word splitting (self.word_tokenize)
     and tagging (pos_tagger.tag). Templates are loaded afterwards and the
     original text is stored in self.text.
     """
     features = []
     for sentence in self.sent_tokenize(swap_num(text)):
         word_pos_pairs = pos_tagger.tag(self.word_tokenize(sentence))
         features.append([{"w": token, "p": tag} for token, tag in word_pos_pairs])
     self.functions = features

     self.load_templates()
     self.text = text
示例#17
0
 def __init__(self, text):
     """
     Keep the raw text and create a {"w": word} dict for every word.

     text = the document to process; swap_num is applied before it is split
     into sentences (self.sent_tokenize) and words (self.word_tokenize).
     self.functions holds one list of word dicts per sentence. Templates
     are loaded at the end.
     """
     self.text = text

     sentences = self.sent_tokenize(swap_num(text))
     word_dicts = []
     for sentence in sentences:
         word_dicts.append([{"w": token} for token in self.word_tokenize(sentence)])
     self.functions = word_dicts

     self.load_templates()