def main(year, doctype='grant'):
    """Append one row per raw inventor to ``disambiguator.csv``.

    Patents are iterated straight off the query rather than loaded up front,
    to save memory.  ``subqueryload`` is used for the inventor, assignee and
    class relationships to get better performance by issuing fewer queries on
    the backend:
    http://docs.sqlalchemy.org/en/latest/orm/tutorial.html#eager-loading

    Args:
        year: when truthy, only records whose ``date`` falls in this year are
            exported; otherwise every record is exported.
        doctype: forwarded to ``alchemy.fetch_session`` as ``dbtype``.  The
            value ``'application'`` selects ``App_Application`` instead of
            ``Patent``.

    Any exception raised while handling a patent is printed and processing
    continues with the next patent; rows already written for that patent's
    earlier inventors stay in the file.
    """
    session = alchemy.fetch_session(dbtype=doctype)
    schema = alchemy.schema.Patent
    if doctype == 'application':
        schema = alchemy.schema.App_Application

    query = session.query(schema)
    if year:
        # Filter on the caller-supplied year.
        query = query.filter(extract('year', schema.date) == year)
    # NOTE(review): SQLAlchemy documents yield_per() as incompatible with
    # subqueryload() on collections in newer releases -- confirm against the
    # version this project pins.
    query = query.options(subqueryload('rawinventors'),
                          subqueryload('rawassignees'),
                          subqueryload('classes')).yield_per(1)

    for i, patent in enumerate(query, 1):
        if i % 100000 == 0:
            # Progress marker: running count and wall-clock time.
            print('%s %s' % (i, datetime.now()))
        try:
            # Create the common dict for this patent.  Location comes from
            # the first raw inventor; an empty inventor list raises here and
            # is reported by the handler below.
            loc = patent.rawinventors[0].rawlocation
            mainclass = patent.classes[0].mainclass_id if patent.classes else ''
            subclass = patent.classes[0].subclass_id if patent.classes else ''
            row = {
                'number': patent.number,
                'mainclass': mainclass,
                'subclass': subclass,
                'state': loc.state if loc else '',
                'country': loc.country if loc else '',
                'city': loc.city if loc else '',
            }
            row['assignee'] = (get_assignee_id(patent.assignees[0])
                               if patent.assignees else '')
            row['rawassignee'] = (get_assignee_id(patent.rawassignees[0])
                                  if patent.rawassignees else '')

            # Generate a row for each of the inventors on the patent.
            for ri in patent.rawinventors:
                # name_last is the last space-delimited word of the stored
                # last name; the middle name is everything before it.
                name_middle, _, name_last = ri.name_last.rpartition(' ')
                tmprow = row.copy()
                tmprow.update({
                    'name_first': ri.name_first,
                    'uuid': ri.uuid,
                    'name_middle': name_middle,
                    'name_last': name_last,
                })
                newrow = normalize_utf8(ROW(tmprow))
                # The file is reopened in append mode for every row, so each
                # row is flushed and closed before the next one is built.
                with codecs.open('disambiguator.csv', 'a',
                                 encoding='utf-8') as out:
                    out.write(newrow)
        except Exception as e:
            # Best effort: report the problem and move on to the next patent.
            print(e)
def _resolve_location_field(field, loc, rawloc, primloc):
    """Return one location attribute for an inventor, with fallbacks.

    Args:
        field: attribute name to read ('state', 'country' or 'city').
        loc: preferred location object, or a falsy value when there is none.
        rawloc: the inventor's raw location, or a falsy value.
        primloc: location of the patent's first inventor, or a falsy value.

    Returns:
        ``loc.<field>`` when it is non-empty and does not contain ``'??'``.
        Otherwise ``rawloc.<field>`` if ``rawloc`` is set, else
        ``primloc.<field>`` if ``primloc`` is set, else ``''``.  A ``None``
        value on ``loc`` is treated like an empty string instead of raising.
    """
    value = getattr(loc, field) if loc else ''
    if value and '??' not in value:
        return value
    if rawloc:
        return getattr(rawloc, field)
    if primloc:
        return getattr(primloc, field)
    return ''


def main(year, doctype):
    """Append one row per raw inventor to ``disambiguator.csv``.

    Patents are iterated straight off the query rather than loaded up front,
    to save memory.  ``subqueryload`` is used for the inventor, assignee and
    class relationships to get better performance by issuing fewer queries on
    the backend:
    http://docs.sqlalchemy.org/en/latest/orm/tutorial.html#eager-loading

    Args:
        year: when truthy, only records whose ``date`` falls in this year are
            exported; otherwise every record is exported.
        doctype: forwarded to ``alchemy.fetch_session`` as ``dbtype``.
            ``'grant'`` rows get ``isgrant`` = 1; ``'application'`` selects
            ``App_Application``, sets ``isgrant`` = 0 and sets ``ignore`` = 1
            when the application's ``granted`` flag is set.

    Inventors whose first name is blank are skipped.  Any exception raised
    while handling a patent is printed and processing continues with the
    next patent; rows already written for that patent stay in the file.
    """
    session = alchemy.fetch_session(dbtype=doctype)
    schema = alchemy.schema.Patent
    if doctype == 'application':
        schema = alchemy.schema.App_Application

    query = session.query(schema)
    if year:
        query = query.filter(extract('year', schema.date) == year)
    # NOTE(review): SQLAlchemy documents yield_per() as incompatible with
    # subqueryload() on collections in newer releases -- confirm against the
    # version this project pins.
    query = query.options(subqueryload('rawinventors'),
                          subqueryload('rawassignees'),
                          subqueryload('classes')).yield_per(1)

    for i, patent in enumerate(query, 1):
        if i % 100000 == 0:
            # Progress marker: running count and wall-clock time.
            print('%s %s' % (i, datetime.now()))
        try:
            # Create the common dict for this patent.  The first inventor's
            # location is the fallback for inventors without their own; an
            # empty inventor list raises here and is reported below.
            primrawloc = patent.rawinventors[0].rawlocation
            primloc = primrawloc.location if primrawloc else primrawloc
            mainclass = patent.classes[0].mainclass_id if patent.classes else ''
            subclass = patent.classes[0].subclass_id if patent.classes else ''
            row = {
                'number': patent.id,
                'mainclass': mainclass,
                'subclass': subclass,
                'ignore': 0,
            }
            if doctype == 'grant':
                row['isgrant'] = 1
            elif doctype == 'application':
                row['isgrant'] = 0
                if patent.granted:
                    row['ignore'] = 1

            # Both columns carry the same value: the first raw assignee's
            # clean id, truncated at the first tab.
            if patent.rawassignees:
                assignee = get_cleanid(patent.rawassignees[0]).split('\t')[0]
            else:
                assignee = ''
            row['assignee'] = assignee
            row['rawassignee'] = assignee

            # Generate a row for each of the inventors on the patent.
            for ri in patent.rawinventors:
                if not ri.name_first.strip():
                    continue
                # name_last is the last space-delimited word of the stored
                # last name; the middle name is everything before it.
                name_middle, _, name_last = ri.name_last.rpartition(' ')
                namedict = {
                    'name_first': ri.name_first,
                    'uuid': ri.uuid,
                    'name_middle': name_middle,
                    'name_last': name_last,
                }

                # Prefer the inventor's own linked location, then fall back
                # to the first inventor's.
                rawloc = ri.rawlocation
                loc = rawloc.location if rawloc and rawloc.location else primloc
                for field in ('state', 'country', 'city'):
                    namedict[field] = _resolve_location_field(
                        field, loc, rawloc, primloc)

                tmprow = row.copy()
                tmprow.update(namedict)
                newrow = normalize_utf8(ROW(tmprow))
                # The file is reopened in append mode for every row, so each
                # row is flushed and closed before the next one is built.
                with codecs.open('disambiguator.csv', 'a',
                                 encoding='utf-8') as out:
                    out.write(newrow)
        except Exception as e:
            # Best effort: report the problem and move on to the next patent.
            print(e)
def _resolve_location_field(field, loc, rawloc, primloc):
    """Return one location attribute for an inventor, with fallbacks.

    Args:
        field: attribute name to read ('state', 'country' or 'city').
        loc: preferred location object, or a falsy value when there is none.
        rawloc: the inventor's raw location, or a falsy value.
        primloc: location of the patent's first inventor, or a falsy value.

    Returns:
        ``loc.<field>`` when it is non-empty and does not contain ``'??'``.
        Otherwise ``rawloc.<field>`` if ``rawloc`` is set, else
        ``primloc.<field>`` if ``primloc`` is set, else ``''``.  A ``None``
        value on ``loc`` is treated like an empty string instead of raising.
    """
    value = getattr(loc, field) if loc else ''
    if value and '??' not in value:
        return value
    if rawloc:
        return getattr(rawloc, field)
    if primloc:
        return getattr(primloc, field)
    return ''


def main(year, doctype):
    """Append one row per raw inventor to ``disambiguator.csv``.

    Patents are iterated straight off the query rather than loaded up front,
    to save memory.  ``subqueryload`` is used for the inventor, assignee and
    class relationships to get better performance by issuing fewer queries on
    the backend:
    http://docs.sqlalchemy.org/en/latest/orm/tutorial.html#eager-loading

    Args:
        year: when truthy, only records whose ``date`` falls in this year are
            exported; otherwise every record is exported.
        doctype: forwarded to ``alchemy.fetch_session`` as ``dbtype``.
            ``'application'`` selects ``App_Application`` and reads the
            ``classes`` relationship; any other value uses ``Patent`` and
            ``current_classes``.  ``'grant'`` rows get ``isgrant`` = 1;
            application rows get ``isgrant`` = 0 and ``ignore`` = 1 when the
            application's ``granted`` flag is set.

    Inventors whose first name is blank are skipped.  Any exception raised
    while handling a patent is printed and processing continues with the
    next patent; rows already written for that patent stay in the file.
    """
    session = alchemy.fetch_session(dbtype=doctype)
    is_application = doctype == 'application'
    if is_application:
        schema = alchemy.schema.App_Application
        class_relationship = 'classes'
    else:
        schema = alchemy.schema.Patent
        class_relationship = 'current_classes'

    query = session.query(schema)
    if year:
        query = query.filter(extract('year', schema.date) == year)
    # NOTE(review): SQLAlchemy documents yield_per() as incompatible with
    # subqueryload() on collections in newer releases -- confirm against the
    # version this project pins.
    query = query.options(subqueryload('rawinventors'),
                          subqueryload('rawassignees'),
                          subqueryload(class_relationship)).yield_per(1)

    for i, patent in enumerate(query, 1):
        if i % 100000 == 0:
            # Progress marker: running count and wall-clock time.
            print('%s %s' % (i, datetime.now()))
        try:
            # Create the common dict for this patent.  The first inventor's
            # location is the fallback for inventors without their own; an
            # empty inventor list raises here and is reported below.
            primrawloc = patent.rawinventors[0].rawlocation
            primloc = primrawloc.location if primrawloc else primrawloc

            if is_application:
                classes = patent.classes
            else:
                classes = patent.current_classes
            mainclass = classes[0].mainclass_id if classes else ''
            subclass = classes[0].subclass_id if classes else ''

            row = {
                'number': patent.id,
                'mainclass': mainclass,
                'subclass': subclass,
                'ignore': 0,
            }
            if doctype == 'grant':
                row['isgrant'] = 1
            elif is_application:
                row['isgrant'] = 0
                if patent.granted:
                    row['ignore'] = 1

            # Both columns carry the same value: the first raw assignee's
            # clean id, truncated at the first tab.
            if patent.rawassignees:
                assignee = get_cleanid(patent.rawassignees[0]).split('\t')[0]
            else:
                assignee = ''
            row['assignee'] = assignee
            row['rawassignee'] = assignee

            # Generate a row for each of the inventors on the patent.
            for ri in patent.rawinventors:
                if not ri.name_first.strip():
                    continue

                # Let the name parser split the full name; prefix joins the
                # first name and suffix joins the last name, with missing
                # parts dropped.
                parsed = name_parser.parse_name(
                    name_parser.NameFormat.CITESEERX,
                    ri.name_first + ' ' + ri.name_last)
                namedict = {
                    'uuid': ri.uuid,
                    'name_first': ' '.join(
                        filter(None, (parsed.Prefix, parsed.GivenName))),
                    'name_middle': (parsed.OtherName
                                    if parsed.OtherName is not None else ''),
                    'name_last': ' '.join(
                        filter(None, (parsed.FamilyName, parsed.Suffix))),
                }

                # Prefer the inventor's own linked location, then fall back
                # to the first inventor's.
                rawloc = ri.rawlocation
                loc = rawloc.location if rawloc and rawloc.location else primloc
                for field in ('state', 'country', 'city'):
                    namedict[field] = _resolve_location_field(
                        field, loc, rawloc, primloc)

                tmprow = row.copy()
                tmprow.update(namedict)
                newrow = normalize_utf8(ROW(tmprow))
                # The file is reopened in append mode for every row, so each
                # row is flushed and closed before the next one is built.
                with codecs.open('disambiguator.csv', 'a',
                                 encoding='utf-8') as out:
                    out.write(newrow)
        except Exception as e:
            # Best effort: report the problem and move on to the next patent.
            print(e)