def __init__(self, xml_string, filename, is_string=False):
        """Parse a ``us_patent_grant`` document and collect its header fields.

        Args:
            xml_string: XML text when ``is_string`` is true; otherwise it is
                handed to ``parser.parse`` unchanged.
            filename: Name of the source file.  The portion starting at the
                first ``ipg`` is stored as ``self.filename``; when the name
                contains no ``ipg`` the whole value is stored instead.
            is_string: If true, wrap ``xml_string`` in ``StringIO`` first.
        """
        xh = xml_driver.XMLHandler()
        parser = xml_driver.make_parser()

        parser.setContentHandler(xh)
        # Switch the parser's external-general-entities feature off.
        parser.setFeature(xml_driver.handler.feature_external_ges, False)
        locator = xml.sax.xmlreader.Locator()
        xh.setDocumentLocator(locator)
        if is_string:
            parser.parse(StringIO(xml_string))
        else:
            parser.parse(xml_string)

        self.attributes = ['pat','app','assignee_list','patent','inventor_list','lawyer_list',
                     'us_relation_list','us_classifications','ipcr_classifications',
                     'citation_list','claims']

        self.xml = xh.root.us_patent_grant
        self.xml_string = xml_string

        # Look the two reference elements up once and reuse them below.
        pub_ref = self.xml.publication_reference
        app_ref = self.xml.application_reference

        self.country = pub_ref.contents_of('country', upper=False)[0]
        self.patent = xml_util.normalize_document_identifier(pub_ref.contents_of('doc_number')[0])
        self.kind = pub_ref.contents_of('kind')[0]
        self.date_grant = pub_ref.contents_of('date')[0]
        if app_ref:
            self.pat_type = app_ref[0].get_attribute('appl-type', upper=False)
        else:
            self.pat_type = None
        self.date_app = app_ref.contents_of('date')[0]
        self.country_app = app_ref.contents_of('country')[0]
        self.patent_app = app_ref.contents_of('doc_number')[0]
        self.code_app = self.xml.contents_of('us_application_series_code')[0]
        self.clm_num = self.xml.contents_of('number_of_claims')[0]
        self.abstract = h.unescape(self.xml.abstract.contents_of('p', '', as_string=True, upper=False))
        self.invention_title = h.unescape(self._invention_title())

        # re.search returns None when nothing matches; fall back to the name
        # as given instead of failing with an AttributeError on None.
        name_match = re.search('ipg.*$', filename, re.DOTALL)
        self.filename = name_match.group() if name_match else filename

        self.pat = {
            "id": self.patent,
            "type": self.pat_type,
            "number": self.patent,
            "country": self.country,
            "date": self._fix_date(self.date_grant),
            "abstract": self.abstract,
            "title": self.invention_title,
            "kind": self.kind,
            "num_claims": self.clm_num,
            "filename": self.filename
        }
        self.app = {
            "type": self.code_app,
            "number": self.patent_app,
            "country": self.country_app,
            "date": self._fix_date(self.date_app)
        }
        # Application id is "<first four chars of the fixed date>/<number>".
        self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]
Пример #2
0
    def __init__(self, xml_string, is_string=False):
        """Parse a ``patent_application_publication`` document.

        Args:
            xml_string: XML text when ``is_string`` is true; otherwise it is
                handed to ``parser.parse`` unchanged.
            is_string: If true, wrap ``xml_string`` in ``StringIO`` first.
        """
        xh = xml_driver.XMLHandler()
        parser = xml_driver.make_parser()

        parser.setContentHandler(xh)
        # Switch the parser's external-general-entities feature off.
        parser.setFeature(xml_driver.handler.feature_external_ges, False)
        locator = xml.sax.xmlreader.Locator()
        xh.setDocumentLocator(locator)
        if is_string:
            parser.parse(StringIO(xml_string))
        else:
            parser.parse(xml_string)

        self.attributes = [
            'app', 'application', 'assignee_list', 'inventor_list',
            'us_classifications', 'claims'
        ]

        self.xml = xh.root.patent_application_publication

        # Run the country-code lookup once and keep the non-list entries in a
        # real list, so it can be truth-tested and indexed on Python 2 and 3
        # alike (a Python 3 ``filter`` object supports neither).
        country_codes = [
            code for code in self.xml.contents_of('country_code')
            if not isinstance(code, list)
        ]
        self.country = country_codes[0] if country_codes else ''
        self.application = xml_util.normalize_document_identifier(
            self.xml.application_number.contents_of('doc_number')[0])
        self.kind = self.xml.document_id.contents_of('kind_code')[0]
        self.pat_type = None
        self.date_app = self.xml.domestic_filing_data.contents_of(
            'filing_date')[0]
        # The claim count is derived from the parsed claim elements.
        self.clm_num = len(self.xml.subdoc_claims.claim)
        self.abstract = self.xml.subdoc_abstract.contents_of('paragraph',
                                                             '',
                                                             as_string=True,
                                                             upper=False)
        self.invention_title = self._invention_title()

        self.app = {
            "id": self.application,
            "type": self.pat_type,
            # NOTE(review): both halves of "number" are the same slice of the
            # application identifier -- confirm that this is intended.
            "number": self.application[2:] + '/' + self.application[2:],
            "country": self.country,
            "date": self._fix_date(self.date_app),
            "abstract": self.abstract,
            "title": self.invention_title,
            "kind": self.kind,
            "num_claims": self.clm_num
        }
        # Replace the placeholder id with "<first four chars of date>/<id>".
        self.app["id"] = str(self.date_app)[:4] + '/' + self.application
Пример #3
0
    def __init__(self, xml_string, is_string=False):
        """Parse a ``us_patent_application`` document and fill ``self.app``.

        Args:
            xml_string: XML text when ``is_string`` is true; otherwise it is
                handed to ``parser.parse`` unchanged.
            is_string: If true, wrap ``xml_string`` in ``StringIO`` first.
        """
        handler = xml_driver.XMLHandler()
        sax_parser = xml_driver.make_parser()
        sax_parser.setContentHandler(handler)
        # Switch the parser's external-general-entities feature off.
        sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
        handler.setDocumentLocator(xml.sax.xmlreader.Locator())

        source = StringIO(xml_string) if is_string else xml_string
        sax_parser.parse(source)

        self.attributes = [
            'app', 'application', 'assignee_list', 'inventor_list',
            'us_classifications', 'claims'
        ]

        self.xml = handler.root.us_patent_application
        app_ref = self.xml.application_reference

        self.country = app_ref.contents_of('country', upper=False)[0]
        self.application = xml_util.normalize_document_identifier(
            app_ref.contents_of('doc_number')[0])
        self.kind = self.xml.publication_reference.contents_of('kind')[0]
        self.date_app = app_ref.contents_of('date')[0]
        # The application type comes from the first application reference,
        # when there is one.
        self.pat_type = (
            app_ref[0].get_attribute('appl-type', upper=False)
            if app_ref else None
        )
        self.clm_num = len(self.xml.claims.claim)
        self.abstract = self.xml.abstract.contents_of(
            'p', '', as_string=True, upper=False)
        self.invention_title = self._invention_title()

        # NOTE(review): both halves of "number" are the same slice of the
        # application identifier -- confirm that this is intended.
        number = self.application[2:] + '/' + self.application[2:]
        self.app = {
            # "<first four chars of the application date>/<application id>"
            "id": str(self.date_app)[:4] + '/' + self.application,
            "type": self.pat_type,
            "number": number,
            "country": self.country,
            "date": self._fix_date(self.date_app),
            "abstract": self.abstract,
            "title": self.invention_title,
            "kind": self.kind,
            "num_claims": self.clm_num
        }
Пример #4
0
    def __init__(self, xml_string, is_string=False):
        """Parse a ``us_patent_grant`` document, including description text.

        The input first goes through the description, claims and abstract
        patch helpers (in that order) and is then parsed.  Results are stored
        in ``self.pat``, ``self.app`` and ``self.description``.

        Args:
            xml_string: XML text when ``is_string`` is true; otherwise it is
                handed to ``parser.parse`` after patching.
            is_string: If true, wrap the patched text in ``StringIO`` first.
        """
        xml_string = self._abstract_patch(
            self._claims_patch(self._description_patch(xml_string)))

        handler = xml_driver.XMLHandler()
        self.xh = handler
        sax_parser = xml_driver.make_parser()
        sax_parser.setContentHandler(handler)
        # Switch the parser's external-general-entities feature off.
        sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
        handler.setDocumentLocator(xml.sax.xmlreader.Locator())

        source = StringIO(xml_string) if is_string else xml_string
        sax_parser.parse(source)

        self.attributes = ['pat','app','description','assignee_list','patent','inventor_list','lawyer_list',
                     'us_relation_list','us_classifications','ipcr_classifications',
                     'citation_list','claims']

        grant = handler.root.us_patent_grant
        self.xml = grant
        pub_ref = grant.publication_reference
        app_ref = grant.application_reference

        def raw_section(tag):
            # Text of one section, requested with clean_text switched off.
            return grant.contents_of(tag, '', as_string=True, upper=False, clean_text=False)

        self.country = pub_ref.contents_of('country', upper=False)[0]
        self.patent = xml_util.normalize_document_identifier(pub_ref.contents_of('doc_number')[0])
        self.kind = pub_ref.contents_of('kind')[0]
        self.date_grant = pub_ref.contents_of('date')[0]
        self.pat_type = (
            app_ref[0].get_attribute('appl-type', upper=False)
            if app_ref else None
        )
        self.date_app = app_ref.contents_of('date')[0]
        self.country_app = app_ref.contents_of('country')[0]
        self.patent_app = app_ref.contents_of('doc_number')[0]
        self.code_app = grant.contents_of('us_application_series_code')[0]
        self.clm_num = grant.contents_of('number_of_claims')[0]
        self.abstract = grant.contents_of('abstract', '', as_string=True, upper=False)

        self.briefsummarydescription = raw_section('brief_summary_description')
        self.briefdescriptiondrawings = raw_section('brief_description_drawings')
        self.detaileddescription = raw_section('detailed_description')
        self.otherpatentrelations = raw_section('other_patent_relations')

        self.invention_title = self._invention_title()

        self.pat = {
            "id": self.patent,
            "type": self.pat_type,
            "number": self.patent,
            "country": self.country,
            "date": self._fix_date(self.date_grant),
            "abstract": self.abstract,
            "title": self.invention_title,
            "kind": self.kind,
            "num_claims": self.clm_num
        }

        app_date = self._fix_date(self.date_app)
        self.app = {
            "type": self.code_app,
            "number": self.patent_app,
            "country": self.country_app,
            "date": app_date,
            # "<first four chars of the fixed date>/<application number>"
            "id": str(app_date)[:4] + "/" + self.patent_app
        }

        self.description = {
            "id": self.patent,
            "briefsummarydescription": self.briefsummarydescription,
            "briefdescriptiondrawings": self.briefdescriptiondrawings,
            "detaileddescription": self.detaileddescription,
            "otherpatentrelations": self.otherpatentrelations
        }
Пример #5
0
from cStringIO import StringIO
from datetime import datetime
from unidecode import unidecode
from handler import Patobj, PatentHandler
import re
import uuid
import xml.sax
import xml_util
import xml_driver

# Scratch script: run one input through the project's SAX handler and print
# what ``contents_of`` returns for the ``claim`` elements.
# Despite its name, ``xml_string`` is handed to parser.parse() as the source.
xml_string = 'ipg050104.xml'

xh = xml_driver.XMLHandler()
parser = xml_driver.make_parser()
parser.setContentHandler(xh)
# Switch the parser's external-general-entities feature off.
parser.setFeature(xml_driver.handler.feature_external_ges, False)

l = xml.sax.xmlreader.Locator()
xh.setDocumentLocator(l)

#parser.parse(StringIO(xml_string))
parser.parse(xml_string)
# Parenthesised single-argument print: prints the same text on Python 2 and
# is also valid syntax on Python 3.
print("parsing done")

#print type(xh.root.us_bibliographic_data_grant.publication_reference.contents_of('document_id', '', as_string=False))
print(xh.root.claims.contents_of('claim', '', as_string=True, upper=False))

#print type(xh.root.us_bibliographic_data_grant.publication_reference.contents_of('document_id', '', as_string=True))
#print xh.root.us_bibliographic_data_grant.publication_reference.contents_of('document_id', '', as_string=True)
    def __init__(self, xml_string, filename, is_string=False):
        """Parse a ``patent_application_publication`` document.

        Args:
            xml_string: XML text when ``is_string`` is true; otherwise it is
                handed to ``parser.parse`` unchanged.  The abstract is pulled
                from this value with a regular expression and is left empty
                when that extraction fails.
            filename: Name of the source file.  The portion matching
                ``i?pa[0-9]*.*$`` is stored as ``self.filename``; when
                nothing matches, the whole value is stored instead.
            is_string: If true, wrap ``xml_string`` in ``StringIO`` first.
        """
        xh = xml_driver.XMLHandler()
        parser = xml_driver.make_parser()
        parser.setContentHandler(xh)
        # Switch the parser's external-general-entities feature off.
        parser.setFeature(xml_driver.handler.feature_external_ges, False)
        locator = xml.sax.xmlreader.Locator()
        xh.setDocumentLocator(locator)
        if is_string:
            parser.parse(StringIO(xml_string))
        else:
            parser.parse(xml_string)

        self.attributes = [
            'app', 'application', 'assignee_list', 'inventor_list',
            'us_classifications', 'claims'
        ]

        self.xml = xh.root.patent_application_publication
        self.xml_string = xml_string

        # Run the country-code lookup once and keep the non-list entries in a
        # real list, so it can be truth-tested and indexed on Python 2 and 3
        # alike (a Python 3 ``filter`` object supports neither).
        country_codes = [
            code for code in self.xml.contents_of('country_code')
            if not isinstance(code, list)
        ]
        self.country = country_codes[0] if country_codes else ''

        doc_id = self.xml.document_id
        self.application = xml_util.normalize_document_identifier(
            doc_id.contents_of('doc_number')[0])
        self.kind = doc_id.contents_of('kind_code')[0]
        try:
            self.pat_type = type_kind[self.kind]
        except Exception:
            # Best effort: any lookup failure leaves the type unset.
            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
            # and SystemExit are not swallowed.
            self.pat_type = None
        self.date_app = doc_id.contents_of('document_date')[0]
        self.clm_num = len(self.xml.subdoc_claims.claim)
        #self.abstract = self.xml.subdoc_abstract.contents_of('paragraph', '', as_string=True, upper=False)
        try:
            # Take the abstract from the raw text: strip tags, drop line
            # breaks/tabs/form feeds, collapse whitespace, then unescape.
            abstract = re.search(
                '<subdoc-abstract>(.*?)</subdoc-abstract>', xml_string,
                re.DOTALL).group(1)
            abstract = re.sub('<.*?>|</.*?>', '', abstract)
            abstract = re.sub('[\n\t\r\f]+', '', abstract)
            abstract = re.sub(r'\s+', ' ', abstract)
            self.abstract = h.unescape(abstract)
        except Exception:
            # Best effort: no abstract element (search returned None) or any
            # other extraction failure yields an empty abstract.
            self.abstract = ''
        self.invention_title = h.unescape(self._invention_title())

        # re.search returns None when nothing matches; fall back to the name
        # as given instead of failing with an AttributeError on None.
        name_match = re.search('i?pa[0-9]*.*$', filename, re.DOTALL)
        self.filename = name_match.group() if name_match else filename

        self.app = {
            "id": self.application,
            "type": self.pat_type,
            "number": self.application,
            "country": self.country,
            "date": self._fix_date(self.date_app),
            "abstract": self.abstract,
            "title": self.invention_title,
            "kind": self.kind,
            "num_claims": self.clm_num,
            "filename": self.filename
        }
        # Replace the placeholder id with
        # "<first four chars of the fixed date>/<number>".
        self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]