def read_docs(self, idref=None):
    """Yield the documents of the collection, reading them from disk if needed.

    When ``self.__docs`` is non-empty its values are yielded and no file is
    opened. Otherwise each selected file is read in full, split on the
    separator line of asterisks, and every chunk that matches
    ``self._pattern`` with a non-empty ``id`` group is turned into a
    ``Document``.

    Args:
        idref: Optional reference id. When truthy, a single file is read:
            the one derived from the last entry of ``self._id_files`` that
            is less than or equal to ``idref``. When falsy, one file per
            entry of ``self._id_files`` is read.

    Yields:
        Document: one per matching chunk, or the cached values.

    Raises:
        IndexError: if ``idref`` is truthy and no entry of
            ``self._id_files`` is less than or equal to it.
    """
    if self.__docs:
        yield from self.__docs.values()
        return

    # NOTE: the cache is reset but never filled here (the assignment into it
    # was commented out in the original), so every call re-reads the files.
    self.__docs = dict()

    if idref:
        eligible = [start for start in self._id_files if idref >= start]
        # Indexing with [-1] keeps the original IndexError when nothing is eligible.
        file_keys = ['%04.3f' % (eligible[-1] / 1000)]
    else:
        file_keys = ['%04.3f' % (start / 1000) for start in self._id_files]

    for key in file_keys:
        # Read the whole file and close it before yielding anything, so the
        # handle is released even if the consumer abandons the generator.
        with open(self.local_file_doc % key, 'r') as handle:
            chunks = handle.read().split(
                '********************************************')
        for chunk in chunks:
            match = re.match(self._pattern, chunk, re.DOTALL)
            if not match:
                continue
            fields = match.groupdict()
            if fields['id']:
                yield Document(id=int(fields['id']), text=fields['text'])
def read_doc(self, id):
    """Return the document whose id group equals ``str(id)``.

    The file at ``self.local_file_doc`` is read in full and split on the
    literal ``'.I'``; each piece is matched against ``self._pattern``.

    Args:
        id: Identifier to look for; compared as a string with the ``id``
            group of each matching record.

    Returns:
        A ``Document`` built from the first matching record's ``id`` and
        ``text`` groups, or ``None`` when no record has that id.
    """
    # The context manager closes the file; the original left the handle open.
    with open(self.local_file_doc, 'r') as handle:
        records = handle.read().split('.I')

    wanted = str(id)
    for record in records:
        match = re.match(self._pattern, record, re.DOTALL)
        if match is None:
            continue
        fields = match.groupdict()
        if fields['id'] == wanted:
            return Document(id=fields['id'], text=fields['text'])
    return None
def read_doc(self, id):
    """Return the document with the given id, from the cache or from disk.

    When ``self.__docs`` is non-empty the document is looked up there under
    ``int(id)``. Otherwise the file at ``self.local_file_doc`` is read in
    full, split on ``'/'``, and each piece is matched against
    ``self._pattern``.

    Args:
        id: Identifier to look for; converted with ``int`` before comparing.

    Returns:
        The cached entry, or a ``Document`` built from the first record
        whose ``id`` group equals ``int(id)``, or ``None`` when the file
        holds no such record.

    Raises:
        KeyError: if the cache is non-empty and has no entry for ``int(id)``.
        ValueError: if ``id`` or a matched ``id`` group is not an integer.
    """
    if self.__docs:
        return self.__docs[int(id)]

    # The context manager closes the file; the original left the handle open.
    with open(self.local_file_doc, 'r') as handle:
        records = handle.read().split('/')

    for record in records:
        match = re.match(self._pattern, record, re.DOTALL)
        if match is None:
            continue
        fields = match.groupdict()
        if int(fields['id']) == int(id):
            return Document(id=fields['id'], text=fields['text'])
    return None
def read_doc(self, id):
    """Return the document with the given id, loading its text from its own file.

    The index file at ``self.local_file_doc`` is read as UTF-8 and split on
    the literal ``'.I'``. For the first record whose ``id`` group, converted
    to ``int``, equals ``id``, the stripped ``text`` group is used as a file
    name: ``self.path + "/" + name + ".txt"`` is read and becomes the
    document text.

    Args:
        id: Identifier to look for. It is compared with ``==`` against an
            ``int``, so a string id never matches.

    Returns:
        A ``Document`` with ``id``, ``text`` (file content) and ``name``
        (the stripped ``text`` group), or ``None`` when no record matches.

    Raises:
        ValueError: if a matched ``id`` group is not an integer.
        OSError: if the index or the referenced document file cannot be opened.
    """
    # The context manager closes the index file; the original left it open.
    with open(self.local_file_doc, 'r', encoding="utf-8") as handle:
        records = handle.read().split('.I')

    for record in records:
        match = re.match(self._pattern, record, re.DOTALL)
        if match is None:
            continue
        fields = match.groupdict()
        doc_id = int(fields['id'])
        if doc_id != id:
            continue
        name = fields['text'].strip()
        # Read the document as UTF-8 like the index, instead of relying on
        # the platform's default encoding.
        with open(self.path + "/" + name + ".txt", encoding="utf-8") as doc_file:
            content = doc_file.read()
        return Document(id=doc_id, text=content, name=name)
    return None
def read_docs(self):
    """Yield every document of the collection.

    When ``self.__docs`` is non-empty its values are yielded and the file is
    not opened. Otherwise the file at ``self.local_file_doc`` is read in
    full, split on the literal ``'.I'``, and each piece that matches
    ``self._pattern`` is turned into a ``Document`` from its ``id`` and
    ``text`` groups.

    Yields:
        Document: one per matching record, or the cached values.
    """
    if self.__docs:
        yield from self.__docs.values()
        return

    # NOTE: the cache is reset but never filled here (the assignment into it
    # was commented out in the original), so every call re-reads the file.
    self.__docs = dict()

    # Read everything and close the file before yielding, so the handle is
    # released even if the consumer stops iterating early.
    with open(self.local_file_doc, 'r') as handle:
        records = handle.read().split('.I')

    for record in records:
        match = re.match(self._pattern, record, re.DOTALL)
        if match:
            fields = match.groupdict()
            yield Document(id=fields['id'], text=fields['text'])
def read_docs(self):
    """Yield every document of the collection, caching them as they are read.

    When ``self.__docs`` is non-empty its values are yielded and the file is
    not opened. Otherwise the file at ``self.local_file_doc`` is read in
    full and split wherever a newline is followed by ``.I``. Each piece that
    matches ``self._pattern`` becomes a ``Document`` whose text is the
    ``title`` group followed by the ``text`` group; it is stored in
    ``self.__docs`` under its ``id`` attribute and then yielded. Pieces that
    do not match and are not blank are reported on standard output.

    Yields:
        Document: one per matching record, or the cached values.
    """
    if self.__docs:
        yield from self.__docs.values()
        return

    self.__docs = dict()

    # Read everything and close the file before yielding; the original left
    # the handle open.
    with open(self.local_file_doc, 'r') as handle:
        content = handle.read()

    # Raw string: '\.' is an invalid escape in a normal string literal.
    records = re.compile(r'\n\.I').split(content)

    # NOTE(review): documents are cached as they are yielded, so a caller
    # that stops iterating early leaves a partial cache that later calls
    # will serve as-is.
    for record in records:
        match = re.match(self._pattern, record, re.DOTALL)
        if match:
            fields = match.groupdict()
            document = Document(id=fields['id'],
                                text=fields['title'] + fields['text'])
            self.__docs[document.id] = document
            yield document
        elif record.strip():
            print("Not match " + record)
def read_docs(self):
    """Yield every document listed in the index, loading each text from its own file.

    ``self.__docs`` is reset to an empty dict on every call. The index file
    at ``self.local_file_doc`` is read as UTF-8 and split on the literal
    ``'.I'``. For each non-empty piece that matches ``self._pattern``, the
    stripped ``text`` group is used as a file name:
    ``self.path + "/" + name + ".txt"`` is read as UTF-8 and becomes the
    document text. Each document is stored in ``self.__docs`` under its
    ``id`` attribute and then yielded.

    Yields:
        Document: with ``id`` (int), ``text`` (file content) and ``name``.

    Raises:
        ValueError: if a matched ``id`` group is not an integer.
        OSError: if the index or a referenced document file cannot be opened.
    """
    self.__docs = dict()

    # The context manager closes the index file; the original left it open.
    with open(self.local_file_doc, 'r', encoding="utf-8") as handle:
        records = handle.read().split('.I')

    for record in records:
        if not record:
            continue
        match = re.match(self._pattern, record, re.DOTALL)
        if not match:
            continue
        fields = match.groupdict()
        name = fields['text'].strip()
        with open(self.path + "/" + name + ".txt", encoding="utf-8") as doc_file:
            content = doc_file.read()
        # Yield after the document file is closed so it is not held open
        # while the consumer processes the document.
        document = Document(id=int(fields['id']), text=content, name=name)
        self.__docs[document.id] = document
        yield document
def read_docs(self):
    """Return a list with one ``Document`` per entry of ``self.local_file_doc``.

    Each document's ``id`` is the entry's position in the iteration
    (starting at 0) and its ``text`` is the entry itself.
    """
    documents = []
    for position, content in enumerate(self.local_file_doc):
        documents.append(Document(id=position, text=content))
    return documents
def read_doc(self, id):
    """Return a ``Document`` wrapping the entry of ``self.local_file_doc`` at ``id``.

    ``id`` is used both as the subscript into ``self.local_file_doc`` and as
    the document's ``id``; a failed lookup propagates its exception.
    """
    content = self.local_file_doc[id]
    return Document(id=id, text=content)