Example #1
def load_internal_conll(files, data_path="data/wikipedia2/"):
    start = time()

    if os.path.exists(data_path + "_".join(files) + ".p"):
        with open(data_path + "_".join(files) + ".p", "rb") as file:
            data = pickle.load(file)
    else:
        columntypes = ["words", "pos", "chunk"]
        conll_reader = ConllCorpusReader(data_path, files, columntypes)

        data = []
        sentences = conll_reader.iob_sents()

        for s in sentences:
            if not s == []:
                w, ne, link = zip(*s)
                stats = {}
                for label in ["O", "ORG", "PER", "LOC", "VESSEL", "MISC"]:
                    stats[label] = np.sum([
                        int(t == "O") if label == "O" else int(label in t)
                        for t in ne
                    ])
                data.append((w, ne, link, stats))

        with open(data_path + "_".join(files) + ".p", "wb") as file:
            pickle.dump(data, file)

    print("Loaded %s in %s seconds" % ('_'.join(files), time() - start))
    return data
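
The unpacking above relies on the shape of ConllCorpusReader.iob_sents(): one (word, tag, IOB) triple per token, where the IOB value comes from whichever column was declared as "chunk". A minimal, self-contained sketch with an invented toy file (the temporary directory, file name and contents are hypothetical):

import os
import tempfile

from nltk.corpus.reader.conll import ConllCorpusReader

# Throwaway three-column CoNLL file: token, tag, IOB label (toy data).
tmpdir = tempfile.mkdtemp()
with open(os.path.join(tmpdir, "toy.conll"), "w") as f:
    f.write("John NNP B-PER\nworks VBZ O\nat IN O\nAcme NNP B-ORG\n\n")

reader = ConllCorpusReader(tmpdir, ["toy.conll"], ["words", "pos", "chunk"])
for sent in reader.iob_sents():
    # Each sentence is a list of (word, tag, IOB) triples, so zip(*sent)
    # produces the parallel sequences that load_internal_conll consumes.
    print(sent)
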
Example #2
def load_ner_data_label(root, filename, batchsize, labels2y=None):
    CCR = ConllCorpusReader(root=root,
                            fileids='.conll',
                            columntypes=('words', 'pos', 'ne', 'chunk'))
    for masked_word_lsts, pos_lst, tags in process_tagged_sents(
            CCR.tagged_sents(filename), batchsize, labels2y):
        yield masked_word_lsts, pos_lst, tags
Example #3
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]

    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]

    elif corpus.lower() == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]

    elif corpus.lower() == "cintil":
        print "Loading CINTIL"
        #column_types = ['ignore','words','ignore','ignore','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/cintil/','cintil-fixed.conll',column_types)
        column_types = ['words','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed.conll',column_types)
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed-reduced.conll',column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]

    else:
        print "Please load either the 'brown' or the 'treebank' corpus."

    size = int(len(tagged_sents) * 0.1)

    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    maxent_tagger.evaluate(test_sents)

    """
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(["Isto", "é", "bastante","rápido", "!"])
    print "\n\n"
    print "show the 40 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(40)
    """

    fModel = open('test.pkl',"wb")
    pickle.dump(maxent_tagger, fModel,1)
    fModel.close()
Example #4
def get_corpus_reader(fileids,
                      columntypes=(ConllCorpusReader.WORDS,
                                   ConllCorpusReader.POS, ConllCorpusReader.NE,
                                   ConllCorpusReader.IGNORE),
                      root=root):
    corpus_reader = ConllCorpusReader(root, fileids, columntypes)

    return corpus_reader
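
For context, the column-type constants used here are plain string attributes of ConllCorpusReader, so passing them is equivalent to the literal tuples such as ('words', 'pos', 'ne', 'chunk') used in the other examples. A quick check:

from nltk.corpus.reader.conll import ConllCorpusReader

# The constants are simply the lowercase column names used elsewhere on this page.
print(ConllCorpusReader.WORDS, ConllCorpusReader.POS,
      ConllCorpusReader.NE, ConllCorpusReader.IGNORE)
# -> words pos ne ignore
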
Example #5
def load_to_nltk(folder):
    """Recursively reads .conll files from a directory tree

    :param folder: the root folder for the corpus
    :returns: a NLTK Corpus object
    """
    fields = ('words', 'ignore', 'ignore', 'ignore', 'pos')
    corpus = ConllCorpusReader(folder, r".*\.conll", fields)
    return corpus
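
A small sketch of what the reader built above can do; the nested folder and toy file contents are invented for illustration. The regex fileid pattern also matches .conll files in sub-directories, and nothing is parsed until a corpus view is accessed:

import os
import tempfile

from nltk.corpus.reader.conll import ConllCorpusReader

root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "sub"))
with open(os.path.join(root, "sub", "a.conll"), "w") as f:
    # Five columns, matching ('words', 'ignore', 'ignore', 'ignore', 'pos') above.
    f.write("Hallo _ _ _ ITJ\nWelt _ _ _ NN\n\n")

corpus = ConllCorpusReader(root, r".*\.conll",
                           ('words', 'ignore', 'ignore', 'ignore', 'pos'))
print(corpus.fileids())          # ['sub/a.conll']
print(corpus.tagged_sents()[0])  # [('Hallo', 'ITJ'), ('Welt', 'NN')]
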
Example #6
def conllReader(corpus):
    '''
    Data reader for CoNLL format data
    '''
    root = "data/"
    sentences = []

    ccorpus = ConllCorpusReader(root, ".conll", ('words', 'pos', 'tree'))

    raw = ccorpus.sents(corpus)

    for sent in raw:
        sentences.append([TreebankWordDetokenizer().detokenize(sent)])

    tagged = ccorpus.tagged_sents(corpus)
    print(tagged)

    return tagged, sentences
Example #7
def test_output(y_pred, args, LABELS, data_fname):
    CCR = ConllCorpusReader(root=args.data,
                            fileids='.conll',
                            columntypes=('words', 'pos', 'ne', 'chunk'))
    print(data_fname)
    tagged_sents = CCR.tagged_sents(data_fname)
    pred_conll = []
    counter = 0
    for tagged_sent in tagged_sents:
        tags_position = return_tag_position(tagged_sent)
        token_lst, tags_lst = zip(*tagged_sent)
        tags_pred = deepcopy(list(tags_lst))
        for tag_pos in tags_position:
            for pos in range(tag_pos[0], tag_pos[1]):
                tags_pred[pos] = tags_lst[pos][:2] + LABELS[y_pred[counter]]

            counter += 1
        pred_conll.append(list(zip(token_lst, tags_lst, tags_pred)))
    return pred_conll
Example #8
    def __init__(self, directory=None, replace_rare_tokens=True, **kwargs):
        self.directory = directory
        # don't load corpus unless directory was passed on object construction
        if self.directory is not None:
            self.directory = data_utils.get_filepaths(directory)
            self.conll_parser = ConllCorpusReader(directory, '.conll',
                                                  ('words', 'pos'))

        self.replace_rare_tokens = replace_rare_tokens

        # word, character and tag sequences from dataset (per partition)
        self.type_seq = {'train': None, 'valid': None, 'test': None}
        # mappings of word, characters, and tag types to unique integer IDs
        self.type_to_idx = {'word': None, 'char': None, 'tag': None}
        # reverse mapping of unique integer IDs to tag types
        self.idx_to_tag = None
        # same as type_seq but all words, characters and tags have been mapped to unique integer IDs
        self.idx_seq = {'train': None, 'valid': None, 'test': None}

        for key, value in kwargs.items():
            setattr(self, key, value)
Example #9
def load_conll03(files=["eng.train", "eng.testa", "eng.testb"], max_len=200):
    start = time()
    columntypes1 = ["words", "pos", "chunk", "ne"]
    columntypes2 = ["words", "pos", "ne", "chunk"]
    conll_reader1 = ConllCorpusReader("data/CoNLL2003/", files, columntypes1)
    conll_reader2 = ConllCorpusReader("data/CoNLL2003/", files, columntypes2)

    words = []
    poses = []
    chunkes = []
    nes = []

    sentences1 = conll_reader1.iob_sents()
    sentences2 = conll_reader2.iob_sents()

    for i, s1 in enumerate(sentences1):
        if not s1 == [] and len(s1) <= max_len:
            w, pos, chunk = zip(*s1)
            _, _, ne = zip(*sentences2[i])
            words.append(list(w))
            poses.append(list(pos))
            chunkes.append(list(chunk))
            nes.append(list(ne))

    print("Loaded CoNLL03 in %s seconds" % (time() - start))

    return words, poses, chunkes, nes
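
The two readers above exist because iob_sents() only exposes the column declared as "chunk"; labelling the NE column as "chunk" in the second reader is what makes the entity tags visible. A toy illustration (directory, file name and contents are hypothetical, mimicking the CoNLL-2003 column order):

import os
import tempfile

from nltk.corpus.reader.conll import ConllCorpusReader

d = tempfile.mkdtemp()
with open(os.path.join(d, "toy.conll"), "w") as f:
    f.write("U.N. NNP I-NP I-ORG\nofficial NN I-NP O\n\n")

r1 = ConllCorpusReader(d, ["toy.conll"], ["words", "pos", "chunk", "ne"])
r2 = ConllCorpusReader(d, ["toy.conll"], ["words", "pos", "ne", "chunk"])
print(r1.iob_sents()[0])  # [('U.N.', 'NNP', 'I-NP'), ('official', 'NN', 'I-NP')]
print(r2.iob_sents()[0])  # [('U.N.', 'NNP', 'I-ORG'), ('official', 'NN', 'O')]
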
Example #10
def load_conll00(files=["train.txt", "test.txt"], max_len=200):
    start = time()
    columntypes = ["words", "pos", "chunk"]
    conll_reader = ConllCorpusReader("data/CoNLL2000/", files, columntypes)

    words = []
    poses = []
    chunkes = []

    sentences = conll_reader.iob_sents()

    for i, s in enumerate(sentences):
        if not s == [] and len(s) <= max_len:
            w, pos, chunk = zip(*s)
            words.append(list(w))
            poses.append(list(pos))
            chunkes.append(list(chunk))

    print("Loaded CoNLL00 in %s seconds" % (time() - start))

    return words, poses, chunkes
Example #11
class Dataset(object):
    """A class for handling datasets. Expects datasets to be in tab-seperated CoNLL format, where
    each line contains a token and its tag (seperated by a tab) and each sentence is seperated
    by a blank line.

    Example corpus:
    '''
    The	O
    transcription	O
    of	O
    most	O
    RP	B-PRGE
    genes	I-PRGE
    ...
    '''

    Args:
        directory (str): Path to directory containing CoNLL formatted dataset(s).
        replace_rare_tokens (bool): True if rare tokens should be replaced with a special unknown
            token. Threshold for considering tokens rare can be found at `saber.constants.NUM_RARE`.
    """
    def __init__(self, directory=None, replace_rare_tokens=True, **kwargs):
        self.directory = directory
        # don't load corpus unless directory was passed on object construction
        if self.directory is not None:
            self.directory = data_utils.get_filepaths(directory)
            self.conll_parser = ConllCorpusReader(directory, '.conll',
                                                  ('words', 'pos'))

        self.replace_rare_tokens = replace_rare_tokens

        # word, character and tag sequences from dataset (per partition)
        self.type_seq = {'train': None, 'valid': None, 'test': None}
        # mappings of word, characters, and tag types to unique integer IDs
        self.type_to_idx = {'word': None, 'char': None, 'tag': None}
        # reverse mapping of unique integer IDs to tag types
        self.idx_to_tag = None
        # same as type_seq but all words, characters and tags have been mapped to unique integer IDs
        self.idx_seq = {'train': None, 'valid': None, 'test': None}

        for key, value in kwargs.items():
            setattr(self, key, value)

    def load(self):
        """Coordinates the loading of a given data set at `self.directory`.

        For a given dataset in CoNLL format at `self.directory`, coordinates the loading of data and
        updates the appropriate instance attributes. Expects `self.directory` to be a directory
        containing a single file, `train.*` and optionally two additional files, `valid.*` and
        `test.*`.

        Raises:
            ValueError if `self.directory` is None.
        """
        if self.directory is None:
            err_msg = "`Dataset.directory` is None; must be provided before call to `Dataset.load`"
            LOGGER.error('ValueError %s', err_msg)
            raise ValueError(err_msg)

        # unique words, chars and tags from CoNLL formatted dataset
        types = self._get_types()
        # map each word, char, and tag type to a unique integer
        self._get_idx_maps(types)

        # get word, char, and tag sequences from CoNLL formatted dataset
        self._get_type_seq()
        # get final representation used for training
        self.get_idx_seq()

        # useful during prediction / annotation
        self.idx_to_tag = generic_utils.reverse_dict(self.type_to_idx['tag'])

    def _get_types(self):
        """Collects the sets of all words, characters and tags in a CoNLL formatted dataset.

        For the CoNLL formatted dataset given at `self.directory`, returns a dictionary with the
        sets of all words (word types), characters (character types) and tags (tag types). All types
        are shared across all partitions, that is, word, char and tag types are collected from the
        train and, if provided, valid/test partitions found at `self.directory/train.*`,
        `self.directory/valid.*` and `self.directory/test.*`.
        """
        types = {
            'word': [constants.PAD, constants.UNK],
            'char': [constants.PAD, constants.UNK],
            'tag': [constants.PAD],
        }

        for _, filepath in self.directory.items():
            if filepath is not None:
                conll_file = os.path.basename(
                    filepath)  # get name of conll file
                types['word'].extend(set(self.conll_parser.words(conll_file)))
                types['char'].extend(
                    set(
                        chain(*[
                            list(w)
                            for w in self.conll_parser.words(conll_file)
                        ])))
                types['tag'].extend(
                    set([
                        tag[-1]
                        for tag in self.conll_parser.tagged_words(conll_file)
                    ]))

        # ensure that we have only unique types
        types['word'] = list(set(types['word']))
        types['char'] = list(set(types['char']))
        types['tag'] = list(set(types['tag']))

        return types

    def _get_type_seq(self):
        """Loads sequence data from a CoNLL format data set given at `self.directory`.

        For the CoNLL formatted dataset given at `self.directory`, updates `self.type_seq` with
        lists containing the word, character and tag sequences for the train and, if provided,
        valid/test partitions found at `self.directory/train.*`, `self.directory/valid.*` and
        `self.directory/test.*`.
        """
        for partition, filepath in self.directory.items():
            if filepath is not None:
                conll_file = os.path.basename(
                    filepath)  # get name of conll file

                # collect sequence data
                sents = list(self.conll_parser.sents(conll_file))
                tagged_sents = list(self.conll_parser.tagged_sents(conll_file))

                word_seq = Preprocessor.replace_rare_tokens(
                    sents) if self.replace_rare_tokens else sents
                char_seq = [[[c for c in w] for w in s] for s in sents]
                tag_seq = [[t[-1] for t in s] for s in tagged_sents]

                # update the class attributes
                self.type_seq[partition] = {
                    'word': word_seq,
                    'char': char_seq,
                    'tag': tag_seq
                }

    def _get_idx_maps(self, types, initial_mapping=None):
        """Updates `self.type_to_idx` with mappings from word, char and tag types to unique int IDs.
        """
        initial_mapping = constants.INITIAL_MAPPING if initial_mapping is None else initial_mapping
        # generate type to index mappings
        self.type_to_idx['word'] = Preprocessor.type_to_idx(
            types['word'], initial_mapping['word'])
        self.type_to_idx['char'] = Preprocessor.type_to_idx(
            types['char'], initial_mapping['word'])
        self.type_to_idx['tag'] = Preprocessor.type_to_idx(
            types['tag'], initial_mapping['tag'])

    def get_idx_seq(self):
        """Updates `self.idx_seq` with the final representation of the data used for training.

        Updates `self.idx_seq` with numpy arrays, by using `self.type_to_idx` to map all elements
        in `self.type_seq` to their corresponding integer IDs, for the train and, if provided,
        valid/test partitions found at `self.directory/train.*`, `self.directory/valid.*` and
        `self.directory/test.*`.
        """
        for partition, filepath in self.directory.items():
            if filepath is not None:
                self.idx_seq[partition] = {
                    'word':
                    Preprocessor.get_type_idx_sequence(
                        self.type_seq[partition]['word'],
                        self.type_to_idx['word'],
                        type_='word'),
                    'char':
                    Preprocessor.get_type_idx_sequence(
                        self.type_seq[partition]['word'],
                        self.type_to_idx['char'],
                        type_='char'),
                    'tag':
                    Preprocessor.get_type_idx_sequence(
                        self.type_seq[partition]['tag'],
                        self.type_to_idx['tag'],
                        type_='tag'),
                }
                # one-hot encode our targets
                self.idx_seq[partition]['tag'] = to_categorical(
                    self.idx_seq[partition]['tag'])
Example #12
"""
Script that appends an end-of-sentence token to every sentence in the eval set
so that it can be tested properly. The result is written to a new file.
"""
from nltk.corpus.reader.conll import ConllCorpusReader

END_TOKEN = "<END>"

corpus = ConllCorpusReader("data", ".tt", ["words", "pos"])
result = list()

for sent in corpus.tagged_sents("de-eval.tt"):
    sent.append((END_TOKEN, END_TOKEN))

    result.append(sent)

try:
    with open("./data/de-eval_end.tt", 'w') as conll_file:
        for sent in result:
            for pair in sent:
                conll_file.write("\t".join(pair) + '\n')
            conll_file.write('\n')
except FileNotFoundError:
    print("Not able to open the file for test writing!")
Example #13
    def features(self, dict):
        d = defaultdict(int)
        """
        for ii in kTOKENIZER.tokenize(dict['Question Text']):
            d[morphy_stem(ii)] += 1
        """
        qd = form_dict(dict['QANTA Scores'])
        wd = form_dict(dict['IR_Wiki Scores'])
        sp = int(dict['Sentence Position'])
        qtext = dict['Question Text']
        list_words = []
        
        sorted_qd = sorted(qd.items(), key=operator.itemgetter(1), reverse=True)
        sorted_wd = sorted(wd.items(), key=operator.itemgetter(1), reverse=True)
        overlap = 0
        consider_qanta = 0
        """
        if sorted_qd[0][0] == sorted_wd[0][0] and sorted_qd[0][1] > 0.5:
            consider_qanta = 1
        """
        
        for w in kTOKENIZER.tokenize(qtext):
            if consider_qanta != 0:
                break
            if morphy_stem(w) not in stopwords.words('english') and w[0] not in PUNC:
                list_words.append(morphy_stem(w))
        #bigrams_qtext = list(bigrams(list_words))
        trigrams_qtext = list(trigrams(list_words))
        highest = 0
        rnk = 0
        
        for i in xrange(10):
            if consider_qanta != 0:
                break
            fid = "./wikipedia/"+sorted_qd[i][0][0]+"/"+sorted_qd[i][0]
            v = 0
            if os.path.exists(fid):
                fids = [fid]
                cr = ConllCorpusReader("", fids, COLUMN_TYPES)
                c = 0
                for r in cr.iob_sents():
                    if c > 3:
                        break
                    wiki_words = []
                    for j in xrange(len(r)):
                        w = morphy_stem(r[j][1].lower())
                        if w not in stopwords.words('english') and w[0] not in PUNC:
                            wiki_words.append(w)
                
                    for j in xrange(len(wiki_words)-2):
                        bgrm = (wiki_words[j],wiki_words[j+1],wiki_words[j+2])
                        if bgrm in trigrams_qtext:
                            ky = "Bigrams"+str(i)
                            d[ky] += TOPGUESSES - i
                            v += 1
                            print "QID:",dict['Question ID'],"SP=",sp,"\t", rnk,v, bgrm[0],bgrm[1],bgrm[2]
                    c += 1
            if v > highest:
                highest = v
                rnk = i
        d['bigrams'] = rnk
        """
        highest = 0
        rnk = 0
        qwords = kTOKENIZER.tokenize(qtext)
        for i in xrange(10):
            fid = "./wikipedia/"+sorted_qd[i][0][0]+"/"+sorted_qd[i][0]
            if os.path.exists(fid):
                fids = [fid]
                cr = ConllCorpusReader("", fids, COLUMN_TYPES)
                
                c = 0
                v = 0
                for r in cr.iob_sents():
                    if c > 4:
                        break
                    for j in xrange(len(r)):
                        if r[j][1].lower() not in stopwords.words('english') and r[j][1][0] not in PUNC and r[j][1].lower() in qwords:
                            v += 1
                    
                    c += 1
                if v > highest:
                    highest = v
                    rnk = i
        d['unigram'] = rnk
        """

        for qg in xrange(len(sorted_qd)):
            if sorted_qd[qg][0] == sorted_wd[0][0]:
                overlap = 1
                d['Top IR Overlap'] += qg
                """
                if  (sorted_qd[0][1] - sorted_qd[1][1]) < 0.15:
                    d['Top IR Overlap'] += qg
                else:
                    d['Top IR Overlap'] -= qg
                """
                break
                
        if overlap == 0:
            if sorted_qd[0][1] < 0.05 and sorted_wd[0][1] > 4.0:
                d['Top IR'] += 1
            if sp == 0 and sorted_qd[0][1] < 0.1:
                d['Top_IR'] += int(sorted_wd[0][1])

        for i in xrange(4*sp):
            if i < len(sorted_qd) and sorted_qd[i][0] == sorted_wd[i][0]:
                d['Equal Rank'] += i + 2
        if sorted_qd[0][1] > 0.75:
            d['Q Score'] = 0
        elif sorted_qd[0][1] > 0.39:
            d['Q Score'] = 1
        elif sorted_qd[0][1] > 0.24:
            d['Q Score'] = 2
        else:
            d['Q Score'] = -1
        if sorted_wd[0][1] > 20.0 and sorted_qd[0][1] < 0.25:
            d['IR Score'] = TOPGUESSES
        
        d['Sentence Position'] = sp
        
        return d