示例#1
0
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                for line in file:
                    if self.print_status:
                        lc.update("Lemma Para Stream")

                    data = json.loads(line)

                    doc_id = data['id']
                    para_id = data['paragraphID']
                    xml = data['annotation']

                    token_list = mf.xml2lemmas(xml)
                    pos_list = mf.xml2pos(xml)

                    para_list = []
                    for i in range(0, len(token_list)):
                        token_cleaned, pos_cleaned = utils.posFilterString(token_list[i], pos_list[i])

                        if len(token_cleaned) > 0:
                            for j in range(0, len(token_cleaned)):
                                para_list.append(token_cleaned[j])
                    yield doc_id, para_id, para_list
示例#2
0
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                lastid = None
                for line in file:
                    if self.print_status:
                        lc.update("Posbigram Sent Stream")

                    data = json.loads(line)

                    xml = data['annotation']
                    id = data['id']
                    if lastid != id:
                        para_num = 0
                    else:
                        para_num += 1
                    lastid = id

                    token_list = mf.xml2words(xml)
                    pos_list = mf.xml2pos(xml)

                    for i in range(0, len(token_list)):
                        token_cleaned, pos_cleaned = utils.posFilterString(token_list[i], pos_list[i])

                        if len(token_cleaned) > 0:
                            yield id, para_num, utils.makeBigrams(pos_cleaned)
示例#3
0
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)
                abs_list = []

                lastid = None
                for line in file:
                    if self.print_status:
                        lc.update("Lemma Doc Stream")

                    data = json.loads(line)

                    doc_id = data['id']
                    xml = data['annotation']

                    if lastid != doc_id and len(abs_list) > 0:
                        # Yield Stuff
                        yield lastid, abs_list
                        abs_list = []

                    lastid = doc_id
                    token_list = mf.xml2lemmas(xml)
                    pos_list = mf.xml2pos(xml)

                    for i in range(0, len(token_list)):
                        token_cleaned, pos_cleaned = utils.posFilterString(token_list[i], pos_list[i])

                        if len(token_cleaned) > 0:
                            for j in range(0, len(token_cleaned)):
                                abs_list.append(token_cleaned[j])
                if len(abs_list) > 0:
                    # Yield Stuff
                    yield lastid, abs_list