def iter_xml(self):
    """Yield an XMLDoc for every accepted file in the corpus archives.

    Each entry of ``self.first_level_files`` is expanded into its member
    names with ``self.get_archive_names``.  Members rejected by
    ``self.correct_file`` are skipped, as are members whose data read via
    ``self.read_archive_file`` is falsy (e.g. ``None`` or empty).

    :return: generator of XMLDoc objects, in archive then member order.
    """
    for archive_name in self.first_level_files:
        for member_name in self.get_archive_names(archive_name):
            if not self.correct_file(member_name):
                continue
            contents = self.read_archive_file(archive_name, member_name)
            if not contents:
                # Nothing usable was read for this member.
                continue
            yield XMLDoc(contents)
def read_archive_file(self, filename):
    """Stream the individual documents held in a large zipped XML file.

    The zip archive at ``self.path``/``filename`` is opened read-only and
    handed to ``separated_xml_with_lines``.  The archive stays open for as
    long as the generator is being consumed.

    :param filename: name of the zip file, relative to ``self.path``.
    :return: generator of ``(sl, el, XMLDoc)`` tuples, where ``sl`` and
        ``el`` are passed through unchanged from
        ``separated_xml_with_lines`` (presumably the start and end line of
        the document within the file -- confirm against that helper).
    """
    archive_path = os.path.join(self.path, filename)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        for segment in separated_xml_with_lines(archive):
            start_line, end_line, document_data = segment
            yield start_line, end_line, XMLDoc(document_data)
def get_patentdoc(self, publication_number):
    """Return a PatentDoc object for a given publication number.

    The publication number is resolved to an archive ``(filename, name)``
    pair with ``self.search_files``; the file is then read with
    ``self.read_archive_file`` and converted via ``XMLDoc.to_patentdoc``.

    :param publication_number: publication number to look up.
    :return: the converted document, or ``None`` when the number cannot be
        located or any step of the lookup/parse fails.
    """
    try:
        filename, name = self.search_files(publication_number)
        if filename and name:
            return XMLDoc(
                self.read_archive_file(filename, name)
            ).to_patentdoc()
    except Exception:
        # Deliberately best-effort: any ordinary failure means "no
        # document".  Catching Exception rather than using a bare except
        # lets KeyboardInterrupt and SystemExit propagate.
        return None
    # Lookup succeeded but did not identify a file.
    return None
def iter_filter_xml(self, classification, sample_size=None):
    """Yield an XMLDoc for each record matching a classification.

    :param classification: list in form ["G", "61", "K", "039", "00"].
        If an entry is None or missing, it and the remaining entries are
        not filtered on.
    :param sample_size: passed through to ``self.get_records`` together
        with the classification.
    :return: generator of XMLDoc objects; records whose file data is falsy
        are skipped.
    """
    matching_records = self.get_records(classification, sample_size)
    for _unused, contents in self.iter_read(matching_records):
        if not contents:
            # No data was read for this record.
            continue
        yield XMLDoc(contents)
def get_doc(self, publication_number):
    """Get XML for publication number.

    Requests the 'description' and 'claims' endpoints for the publication
    from ``self.registered_client`` and wraps the two response texts in an
    XMLDoc.

    :param publication_number: publication number, passed to
        ``epo_ops.models.Epodoc``.
    :return: ``XMLDoc(description, claims)``, or ``None`` when either
        request fails or either text is empty.  A failed request also
        emits a "Full text document not available" warning.
    """
    try:
        description = self.registered_client.published_data(
            reference_type='publication',
            input=epo_ops.models.Epodoc(publication_number),
            endpoint='description').text
        claims = self.registered_client.published_data(
            reference_type='publication',
            input=epo_ops.models.Epodoc(publication_number),
            endpoint='claims').text
    except Exception:
        # Best-effort fetch: any ordinary failure is reported as a warning.
        # Exception (not a bare except) keeps Ctrl-C and interpreter
        # shutdown from being silently swallowed.
        warnings.warn("Full text document not available")
        return None
    if description and claims:
        return XMLDoc(description, claims)
    return None
def read_by_offset(self, filename, offset):
    """Return the XMLDoc found at a given line offset of a zipped file.

    :param filename: name of the zip file, relative to ``self.path``.
    :param offset: line offset handed to ``get_xml_by_line_offset``.
    :return: XMLDoc built from whatever ``get_xml_by_line_offset`` returns
        for the opened archive and offset.
    """
    archive_path = os.path.join(self.path, filename)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        xml_data = get_xml_by_line_offset(archive, offset)
        return XMLDoc(xml_data)
def documents(self):
    """Yield a converted document for every entry in ``self.filelist``.

    The file list is read through ``self.datasource.iter_read``; each
    piece of file data is wrapped in an XMLDoc and converted with
    ``to_patentdoc``.  File data is not checked for emptiness first.

    :return: generator of the objects returned by ``XMLDoc.to_patentdoc``.
    """
    reader = self.datasource.iter_read(self.filelist)
    for entry in reader:
        _, contents = entry
        xmldoc = XMLDoc(contents)
        yield xmldoc.to_patentdoc()
def xmldoc_generator(
    self, classification=None,
    publication_numbers=None, sample_size=None
):
    """Generate documents from the datasource, optionally filtered.

    With no filter, every row of the ``files`` table is read (or a random
    selection of ``sample_size`` rows when that is given) and an XMLDoc is
    yielded for each record that has file data.

    :param classification: classification filter of form ["G", "06"]
        (length 1 to 5).  Matching documents are produced by
        ``self.iter_filter_xml``, which also receives ``sample_size``.
    :param publication_numbers: list (or other sized collection) of
        publication numbers.  Each is looked up with
        ``self.get_patentdoc``; falsy results are skipped.  When
        ``sample_size`` is smaller than the collection, a random sample of
        that size is used instead.
    :param sample_size: optional integer limit on returned documents.
    :return: generator of documents.

    The two filters are intended to be used one at a time, but this is not
    enforced: if both are supplied, the publication-number results are
    yielded first, followed by the classification results.

    NOTE(review): the publication-number branch yields whatever
    ``get_patentdoc`` returns, which may not be an XMLDoc -- confirm that
    callers expect this.
    """
    # No filter supplied: read the whole datasource (or a random sample).
    if not classification and not publication_numbers:
        if sample_size:
            query_string = (
                "SELECT ROWID, filename, name FROM files"
                " WHERE ROWID IN"
                "(SELECT ROWID FROM files ORDER BY RANDOM() LIMIT ?)"
            )
            records = self.c.execute(
                query_string, (sample_size,)).fetchall()
        else:
            query_string = "SELECT ROWID, filename, name FROM files"
            records = self.c.execute(query_string).fetchall()
        for _, filedata in self.iter_read(records):
            if filedata:
                yield XMLDoc(filedata)

    # A collection of publication numbers is supplied.
    if publication_numbers:
        if sample_size and len(publication_numbers) > sample_size:
            # Randomly sample down to sample_size.  random.sample needs a
            # sequence, so copy first; this keeps sets working on Python
            # versions that no longer convert them automatically.
            publication_numbers = random.sample(
                list(publication_numbers), sample_size
            )
        # NOTE: an alternative would be a single
        # "... FROM files WHERE pub_no IN (?, ?, ...)" query over all the
        # publication numbers instead of one lookup per number.
        for publication_number in publication_numbers:
            result = self.get_patentdoc(publication_number)
            if result:
                yield result

    # A classification is supplied.
    if classification:
        for xmldoc in self.iter_filter_xml(classification, sample_size):
            yield xmldoc
def get_classification(self, filedata):
    """Return the patent classifications found in the supplied file data.

    :param filedata: XML data accepted by the XMLDoc constructor.
    :return: the result of ``XMLDoc.classifications()`` -- documented by
        the original author as a list of 5 items.
    """
    xmldoc = XMLDoc(filedata)
    return xmldoc.classifications()