def __iter__(self):
    """Yield one ``(doc_id, para_id, tokens)`` tuple per line of the input file.

    Every line is decoded with ``json.loads`` and must provide the keys
    ``'id'``, ``'paragraphID'`` and ``'annotation'``.  The annotation is
    converted with ``mf.xml2lemmas`` and ``mf.xml2pos``; each pair of
    corresponding entries is passed to ``utils.posFilterString`` and the
    tokens it returns are gathered into a single flat list for that line.
    A tuple is yielded for every line, even when the list ends up empty.
    """
    # NOTE(review): the [0:1] slice means only the first entry of
    # file_list is ever read -- confirm this is intentional.
    for name in self.file_list[0:1]:
        path = os.path.join(self.annotation_dir, name)
        with open(path) as handle:
            timer = LoopTimer(update_after=100)
            for raw in handle:
                if self.print_status:
                    timer.update("Lemma Para Stream")

                record = json.loads(raw)
                doc_id = record['id']
                para_id = record['paragraphID']
                annotation = record['annotation']

                lemma_groups = mf.xml2lemmas(annotation)
                pos_groups = mf.xml2pos(annotation)

                para_list = []
                for idx, lemmas in enumerate(lemma_groups):
                    # Only the filtered tokens are used; the filtered POS
                    # tags returned alongside them are discarded.
                    kept, _ = utils.posFilterString(lemmas, pos_groups[idx])
                    if len(kept) > 0:
                        para_list.extend(kept)

                yield doc_id, para_id, para_list
def __iter__(self):
    """Yield ``(doc_id, para_num, bigrams)`` for each non-empty filtered entry.

    Every line of the input file is decoded with ``json.loads`` and must
    provide the keys ``'annotation'`` and ``'id'``.  ``para_num`` restarts
    at 0 whenever a line's ``'id'`` differs from the previous line's and
    is incremented by one otherwise.  The annotation is converted with
    ``mf.xml2words`` and ``mf.xml2pos``; each pair of corresponding
    entries goes through ``utils.posFilterString`` and, when the filtered
    token sequence is non-empty, ``utils.makeBigrams`` of the filtered POS
    tags is yielded.
    """
    # NOTE(review): the [0:1] slice means only the first entry of
    # file_list is ever read -- confirm this is intentional.
    for name in self.file_list[0:1]:
        path = os.path.join(self.annotation_dir, name)
        with open(path) as handle:
            timer = LoopTimer(update_after=100)
            previous_id = None
            for raw in handle:
                if self.print_status:
                    timer.update("Posbigram Sent Stream")

                record = json.loads(raw)
                annotation = record['annotation']
                doc_id = record['id']

                # NOTE(review): para_num is first bound in the else branch;
                # a first line whose 'id' is None would hit the increment
                # before any assignment -- confirm ids are never null.
                if previous_id == doc_id:
                    para_num += 1
                else:
                    para_num = 0
                previous_id = doc_id

                word_groups = mf.xml2words(annotation)
                pos_groups = mf.xml2pos(annotation)
                for idx, words in enumerate(word_groups):
                    kept_words, kept_pos = utils.posFilterString(
                        words, pos_groups[idx]
                    )
                    if len(kept_words) > 0:
                        yield doc_id, para_num, utils.makeBigrams(kept_pos)
def __iter__(self):
    """Yield ``(doc_id, tokens)`` once per run of consecutive lines sharing an id.

    Every line of the input file is decoded with ``json.loads`` and must
    provide the keys ``'id'`` and ``'annotation'``.  The annotation is
    converted with ``mf.xml2lemmas`` and ``mf.xml2pos``; each pair of
    corresponding entries goes through ``utils.posFilterString`` and the
    returned tokens are appended to a running list.  When the ``'id'``
    changes and the running list is non-empty, the list is yielded under
    the previous id and a fresh list is started.  Whatever remains after
    the last line is yielded as well; groups that produced no tokens are
    never yielded.
    """
    # NOTE(review): the [0:1] slice means only the first entry of
    # file_list is ever read -- confirm this is intentional.
    for name in self.file_list[0:1]:
        path = os.path.join(self.annotation_dir, name)
        with open(path) as handle:
            timer = LoopTimer(update_after=100)
            abs_list = []
            previous_id = None
            for raw in handle:
                if self.print_status:
                    timer.update("Lemma Doc Stream")

                record = json.loads(raw)
                doc_id = record['id']
                annotation = record['annotation']

                # Flush the accumulated tokens as soon as a new id starts.
                if previous_id != doc_id and len(abs_list) > 0:
                    yield previous_id, abs_list
                    # Rebind rather than clear so the list just handed to
                    # the consumer is not modified afterwards.
                    abs_list = []
                previous_id = doc_id

                lemma_groups = mf.xml2lemmas(annotation)
                pos_groups = mf.xml2pos(annotation)
                for idx, lemmas in enumerate(lemma_groups):
                    # Only the filtered tokens are used; the filtered POS
                    # tags returned alongside them are discarded.
                    kept, _ = utils.posFilterString(lemmas, pos_groups[idx])
                    if len(kept) > 0:
                        abs_list.extend(kept)

            # Emit the final group, which no id change follows.
            if len(abs_list) > 0:
                yield previous_id, abs_list