def try_to_link(self, vendor_name):
    """Link vendor_name to its state record when the search is unambiguous.

    Searches the Secretary of State site and only proceeds when exactly
    one hit comes back; anything else is left for manual review.
    """
    results = self.search_sos(vendor_name)
    hits = self.get_total_hits(results)
    if hits == 1:
        # A single hit means we can link without human confirmation.
        log.info("Perfect match for %s", vendor_name)
        self.process_direct_hit(results, vendor_name)
def _match_contract(self, document):
    '''
    Sync one DocumentCloud contract's metadata into the Lens database.

    Reads the metadata stored on the DocumentCloud document, normalizes
    the vendor and department names, ensures both exist in the database,
    then updates the contract row with their database IDs.

    :param document: A python-documentcloud object representing a contract.
    :type document: python-documentcloud document.
    '''
    log.info('Syncing document %s', document.id)
    fields = {}
    fields['purchaseno'] = self._get_metadata(document, "purchase order")
    fields['contractno'] = self._get_metadata(document, "contract number")
    # Periods are stripped so the names match the database's normalized form.
    fields['vendor'] = self._get_metadata(document, "vendor").replace(".", "")
    # BUG FIX: previously read the "vendor" metadata here too, so department
    # columns were filled with vendor names.
    fields['department'] = self._get_metadata(
        document, "department").replace(".", "")
    fields['dateadded'] = document.created_at
    fields['title'] = document.title
    fields['description'] = document.description
    # Make sure both rows exist before looking up their IDs.
    LensDatabase().add_department(fields['department'])
    LensDatabase().add_vendor(fields['vendor'])
    fields['department'] = LensDatabase().get_department_id(
        fields['department'])
    fields['vendor'] = LensDatabase().get_lens_vendor_id(fields['vendor'])
    LensDatabase().update_contract_from_document_cloud(document.id, fields)
def _get_attachment_display_name(self, city_attachment_id):
    '''
    Fetch the city's "file detail" page for an attachment, cache the HTML
    locally and return the attachment's display name from the page header.

    :param city_attachment_id: The city's internal ID for the attachment.
    :returns: string. The attachment's display name.
    '''
    url = ('http://www.purchasing.cityofno.com/bso/external/document/'
           'attachments/attachmentFileDetail.sdo?'
           'fileNbr=%s&docId=%s&docType=P&releaseNbr=0'
           '&parentUrl=/external/purchaseorder/poSummary.sdo&external=true'
           % (city_attachment_id, self.purchaseorder))
    response = urllib2.urlopen(url)
    html = response.read()
    file_location = '%s/%s.html' % (ATTACHMENTS_DIR, city_attachment_id)
    directory = os.path.dirname(file_location)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(file_location, 'w') as html_file:
        log.info('Saving HTML for attachment %s', city_attachment_id)
        html_file.write(html)
    soup = BeautifulSoup(html)
    # The section header reads "Attachment File Detail: <name>".
    header = soup.select(".sectionheader-01")[0].contents.pop()
    header = ' '.join(header.split())
    return str(header).replace("Attachment File Detail:", "").strip()
def _match_contract(self, document):
    '''
    Sync a single contract's DocumentCloud metadata into the database.

    Normalizes the vendor and department names (periods stripped), makes
    sure both are present in the Lens database, and writes the resulting
    IDs plus the document's own fields onto the contract row.

    :param document: A python-documentcloud object representing a contract.
    :type document: python-documentcloud document.
    '''
    log.info('Syncing document %s', document.id)
    fields = {}
    fields['purchaseno'] = self._get_metadata(document, "purchase order")
    fields['contractno'] = self._get_metadata(document, "contract number")
    fields['vendor'] = self._get_metadata(document, "vendor").replace(".", "")
    # BUG FIX: the department field previously pulled the "vendor" metadata
    # key, storing vendor names in the department column.
    fields['department'] = self._get_metadata(
        document, "department").replace(".", "")
    fields['dateadded'] = document.created_at
    fields['title'] = document.title
    fields['description'] = document.description
    LensDatabase().add_department(fields['department'])
    LensDatabase().add_vendor(fields['vendor'])
    fields['department'] = LensDatabase().get_department_id(
        fields['department'])
    fields['vendor'] = LensDatabase().get_lens_vendor_id(fields['vendor'])
    LensDatabase().update_contract_from_document_cloud(document.id, fields)
def _get_vendor_name(self):
    '''
    Read the vendor's name out of the locally cached vendor profile page.

    The profile HTML was saved earlier by _download_vendor_profile().

    :returns: string. The vendor's name, upper-cased to match the
        DocumentCloud project metadata (search queries are upper-cased too).
    '''
    location = '%s/%s.html' % (VENDORS_DIR, self.vendor_id_city)
    with open(location, 'r') as vendor_file:
        log.info('Reading HTML for vendor %s', self.vendor_id_city)
        html = vendor_file.read()
    soup = BeautifulSoup(html)
    # The name sits in the sixth cell of the row holding "Company Name:".
    company_row = soup(text='Company Name:')[0].parent.parent
    cells = company_row.findChildren(['td'])
    vendor_name = cells[5].contents.pop().strip()
    return vendor_name.upper()
def _backup(self, document_cloud_id):
    '''Back up one contract's PDF, metadata and extracted text locally.

    Skips any artifact already on disk unless self.force is set.
    '''
    needs_backup = self._needs_to_be_backed_up(document_cloud_id)
    if not (needs_backup or self.force):
        log.info("%s is already is backed up", document_cloud_id)
        return

    log.info("Creating backup for %s", document_cloud_id)
    document = self.client.documents.get(document_cloud_id)
    metadata = self._get_meta_data(document)

    # The raw PDF.
    pdf_path = self._get_path(document_cloud_id, ".pdf")
    if self.force or not os.path.exists(pdf_path):
        with open(pdf_path, "wb") as outfile:
            outfile.write(document.pdf)

    # The document's metadata, serialized as JSON.
    txt_path = self._get_path(document_cloud_id, ".txt")
    if self.force or not os.path.exists(txt_path):
        with open(txt_path, "wb") as outfile:
            outfile.write(json.dumps(metadata))

    # The extracted full text, serialized as JSON.
    text_txt_path = self._get_path(document_cloud_id, "_text.txt")
    if self.force or not os.path.exists(text_txt_path):
        with open(text_txt_path, "wb") as outfile:
            outfile.write(json.dumps(document.full_text))
def add_name(self, name):
    """Add name to the people or companies table if it is not present yet."""
    cleaned = name.replace(".", "").strip()
    if self.is_this_a_person(cleaned):
        # Suffixes such as Jr. at the end mark the name as a person.
        existing = (SESSION.query(Person)
                    .filter(Person.name == cleaned)
                    .count())
        if existing == 0:
            SESSION.add(Person(cleaned))
            SESSION.commit()
            return
        if existing == 1:
            SESSION.close()
            return
    if self._is_this_a_company(cleaned):
        existing = (SESSION.query(Company)
                    .filter(Company.name == cleaned)
                    .count())
        if existing == 0:
            SESSION.add(Company(cleaned))
            SESSION.commit()
            return
        if existing == 1:
            SESSION.close()
            return
    log.info("Could not link %s", cleaned)
    SESSION.close()
def __init__(self, purchase_order_number):
    """Scrape and assemble one purchase order from cached city HTML.

    Validates the purchase order number, then parses the saved purchase
    order page for vendor, department, description, contract number and
    attachments. Bails out early (leaving most attributes unset) when
    the number is not in a valid format.

    :param purchase_order_number: The purchase order number to build.
    """
    self.purchaseorder = purchase_order_number
    validity = Utilities().check_if_valid_purchase_order_format(
        self.purchaseorder)
    if validity is False:
        # Invalid number: stop before any file or network access.
        log.debug('Purchase order %s is invalid', self.purchaseorder)
        return
    html = self._get_html()
    self.vendor_id_city = self._get_city_vendor_id(html)
    # Cache the vendor's profile page so _get_vendor_name() can read it.
    self._download_vendor_profile(self.vendor_id_city)
    soup = BeautifulSoup(html)
    self.description = self._get_description(soup)
    try:
        self.vendor_name = self._get_vendor_name()
    except IOError as e:
        # The vendor profile file is missing; fall back to a placeholder.
        log.error(e, exc_info=True)
        self.vendor_name = "unknown"
        log.info('No vendor info for purchase order %s', self.purchaseorder)
    self.department = self._get_department(soup)
    self.k_number = self._get_knumber(soup)
    self.attachments = self._get_attachments(soup)
    self.data = self._get_data()
    self.title = "%s : %s" % (self.vendor_name, self.description)
def check_if_need_to_scrape(self, page):
    """
    Decide whether a purchasing-site page is due for a re-scrape.

    Pages 1-10 are re-scraped once per day; deeper pages only once per
    week. A page with no scrape history is always scraped.

    :params page: The purchasing site page number to check.
    :type page: int.
    :returns: boolean. True if need to scrape, False if not.
    """
    last_scraped = self._check_when_last_scraped(page)
    if last_scraped is None:
        return True  # Never scraped before.

    # Early pages go stale daily; later pages only weekly.
    if page <= 10:
        cutoff = date.today()
    else:
        cutoff = date.today() - timedelta(days=7)

    if last_scraped < cutoff:
        return True
    log.info('Skipping page %d. It was scraped recently', page)
    return False
def _download_attachment(self, attachment):
    '''
    Download the PDF for one purchase-order attachment, unless a copy
    already exists on disk.

    :param attachment: Link element for the attachment; its href carries
        the city's internal attachment ID.
    '''
    # The city's internal ID doubles as our local file name, so the
    # directory listing tells us which attachments we already hold.
    city_attachment_id = re.search('[0-9]+', attachment.get('href')).group()
    log.debug('Gathering data for attachment %s', city_attachment_id)
    document_path = '%s/%s.pdf' % (DOCUMENTS_DIR, city_attachment_id)
    display_name = self._get_attachment_display_name(city_attachment_id)
    if os.path.isfile(document_path):
        # Already downloaded on a previous run.
        log.info('Already have PDF for attachment %s', city_attachment_id)
    else:
        self._download_attachment_file(
            city_attachment_id, display_name, document_path)
def _get_attachments(soup):
    '''
    Pull the attachment links out of the contract page HTML.

    :param soup: A BeautifulSoup object for the contract page's HTML.
    :type soup: BeautifulSoup object.
    :returns: list. Anchor tags for the attachment files; empty when the
        city posted none.
    '''
    try:
        main_table = soup.select('.table-01').pop()
        rows = (main_table
                .findChildren(['tr'])[2]
                .findChildren(['td'])[0]
                .findChildren(['table'])[0]
                .findChildren(['tr']))
        return rows[16].findChildren(['td'])[1].findChildren(['a'])
    except IndexError:
        # The city does not always include attachment files.
        log.info('No attachments found')
        return []
def _get_vendor_name(self):
    '''
    Extract the vendor's name from the cached vendor profile page.

    The HTML file was written by _download_vendor_profile().

    :returns: string. The vendor's name in upper case, matching the
        upper-cased DocumentCloud project metadata and search queries.
    '''
    path = '%s/%s.html' % (VENDORS_DIR, self.vendor_id_city)
    with open(path, 'r') as profile:
        log.info('Reading HTML for vendor %s', self.vendor_id_city)
        page_html = profile.read()
    page_soup = BeautifulSoup(page_html)
    # Walk from the "Company Name:" label up to its table row.
    label = page_soup(text='Company Name:')[0]
    row = label.parent.parent
    name = row.findChildren(['td'])[5].contents.pop().strip()
    return name.upper()
def _get_attachment_display_name(self, city_attachment_id):
    '''
    Download the attachment's "file detail" page, cache the HTML and
    return the display name parsed out of the section header.

    :param city_attachment_id: The city's internal attachment ID.
    :returns: string. The attachment's display name.
    '''
    detail_url = (
        'http://www.purchasing.cityofno.com/bso/external/document/'
        'attachments/attachmentFileDetail.sdo?'
        'fileNbr=%s&docId=%s&docType=P&releaseNbr=0'
        '&parentUrl=/external/purchaseorder/poSummary.sdo&external=true'
    ) % (city_attachment_id, self.purchaseorder)
    page = urllib2.urlopen(detail_url)
    html = page.read()
    output_path = '%s/%s.html' % (ATTACHMENTS_DIR, city_attachment_id)
    output_dir = os.path.dirname(output_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_path, 'w') as output_file:
        log.info('Saving HTML for attachment %s', city_attachment_id)
        output_file.write(html)
    soup = BeautifulSoup(html)
    # Header text is "Attachment File Detail: <name>"; collapse whitespace
    # and strip the label prefix.
    raw_header = soup.select(".sectionheader-01")[0].contents.pop()
    collapsed = ' '.join(raw_header.split())
    return str(collapsed).replace("Attachment File Detail:", "").strip()
def _check_if_need_to_download_contract(purchase_order_number):
    '''
    Determines whether this contract should be downloaded, and also
    whether it needs to be added to our DocumentCloud and local database.

    Runs three stages in order — the local file repository, the
    DocumentCloud project, and the local database — each wrapped in its
    own try/except so one failing stage does not abort the rest. Only a
    failure to build the PurchaseOrder object itself aborts, because the
    later stages need that object.

    :param purchase_order_number: The contract's purchase order number.
    :type purchase_order_number: string
    '''
    log.info('Checking purchase order %s', purchase_order_number)

    # Check local file repository
    try:
        log.debug('LensRepository')
        need_to_download = LensRepository(
            purchase_order_number).check_if_need_to_download()
        if need_to_download:
            LensRepository(purchase_order_number).download_purchase_order()
    except urllib2.HTTPError:
        log.exception('Purchase order %s not posted publically',
                      purchase_order_number)

    try:
        log.debug('PurchaseOrder')
        # This object is reused by both stages below.
        purchase_order_object = PurchaseOrder(purchase_order_number)
        purchase_order_object.download_attachments()
    except IndexError:
        # Cannot parse the purchase order; nothing further can be done.
        log.exception(purchase_order_number)
        return

    # Check DocumentCloud project
    try:
        log.debug('DocumentCloudProject')
        need_to_upload = DocumentCloudProject().check_if_need_to_upload(
            purchase_order_number)
        if need_to_upload:
            DocumentCloudProject().prepare_then_add_contract(
                purchase_order_object)
    except urllib2.HTTPError:
        log.exception('Purchase order %s not posted publically',
                      purchase_order_number)

    # Check local database
    try:
        log.debug('LensDatabase')
        contract_exist = LensDatabase().check_if_database_has_contract(
            purchase_order_number)
        if contract_exist is False:
            LensDatabase().add_to_database(purchase_order_object)
    except urllib2.HTTPError:
        log.exception('Purchase order %s is not posted publically.',
                      purchase_order_number)
def _get_html(self):
    '''Return the saved HTML for this purchase order.

    The file was written earlier by the PurchaseOrder scraping step.

    :returns: string. The HTML contents for this purchase order file.
    '''
    path = '%s/%s.html' % (PURCHASE_ORDER_DIR, self.purchaseorder)
    with open(path, 'r') as html_file:
        log.info('Reading HTML for purchase order %s', self.purchaseorder)
        return html_file.read()
def link(self, name, vendor):
    """Link an officer name (person or company) to a vendor.

    Looks up the vendor row, then the name as both a person and a
    company; creates a VendorOfficer or VendorOfficerCompany row when a
    matching link does not already exist.

    :param name: The officer's name (periods and newlines stripped).
    :param vendor: The vendor's name.
    """
    name = name.strip("\n").replace(".", "").strip()
    vendorindb = (SESSION.query(Vendor)
                  .filter(Vendor.name == vendor)
                  .first())
    # BUG FIX: vendorindb.id was previously dereferenced before checking
    # for None, crashing when the vendor was not in the database.
    if vendorindb is None:
        log.info("No vendor row found for %s", vendor)
        SESSION.close()
        return
    personindb = (SESSION.query(Person)
                  .filter(Person.name == name)
                  .first())
    companyindb = (SESSION.query(Company)
                   .filter(Company.name == name)
                   .first())
    if personindb is not None and companyindb is None:
        existing = (SESSION.query(VendorOfficer)
                    .filter(VendorOfficer.vendorid == vendorindb.id)
                    .filter(VendorOfficer.personid == personindb.id)
                    .count())
        if existing < 1:
            # BUG FIX: used "{0}"/"{1}" placeholders with %-style logging
            # args, so the message never formatted.
            log.info("Linking %s to %s",
                     str(vendorindb.id), str(personindb.id))
            SESSION.add(VendorOfficer(vendorindb.id, personindb.id))
            SESSION.commit()
        return
    if companyindb is not None and personindb is None:
        existing = (SESSION.query(VendorOfficerCompany)
                    .filter(VendorOfficerCompany.vendorid == vendorindb.id)
                    .filter(VendorOfficerCompany.companiesid ==
                            companyindb.id)
                    .count())
        if existing < 1:
            # Consistency: log instead of print, matching the person branch.
            log.info("Linking %s to %s",
                     str(vendorindb.id), str(companyindb.id))
            SESSION.add(VendorOfficerCompany(vendorindb.id, companyindb.id))
            SESSION.commit()
        return
    SESSION.close()
def _check_if_contract_number_is_null(purchase_order_object): ''' Checks if this contract number is null. :params purchase_order_object: A PurchaseOrder object. :type purchase_order_object: A PurchaseOrder object. :returns: boolean. True if the contract number is null, False if not. ''' if len(purchase_order_object.data['contract number']) < 1: log.info('Not uploading purchase order %s to DocumentCloud', purchase_order_object.data['purchase order']) log.info('Contract number %s is null', purchase_order_object.data['contract number']) return True else: return False
def match_local_database_to_document_cloud(self):
    '''
    Match our local database to our DocumentCloud project.

    Fetches every contract whose database row is only partly populated
    and re-syncs its metadata from the matching DocumentCloud document.
    '''
    half_filled = LensDatabase().get_half_filled_contracts()
    log.info('%d half-filled contracts need to be synced', len(half_filled))
    for contract_stub in half_filled:
        try:
            document = self.client.documents.get(contract_stub.doc_cloud_id)
            self._match_contract(document)
        except Exception as error:
            # Log and keep going so one bad document doesn't stop the sync.
            log.error(error, exc_info=True)
def link(self, name, vendor):
    """Create a vendor-officer link for a person or company name.

    :param name: The officer's name (periods and newlines stripped).
    :param vendor: The vendor's name.
    """
    name = name.strip("\n").replace(".", "").strip()
    vendorindb = (SESSION.query(Vendor).filter(
        Vendor.name == vendor).first())
    # BUG FIX: the original used vendorindb.id in the link queries before
    # any None check, raising AttributeError for unknown vendors.
    if vendorindb is None:
        log.info("No vendor row found for %s", vendor)
        SESSION.close()
        return
    personindb = (SESSION.query(Person).filter(
        Person.name == name).first())
    companyindb = (SESSION.query(Company).filter(
        Company.name == name).first())
    if personindb is not None and companyindb is None:
        existing = (SESSION.query(VendorOfficer).filter(
            VendorOfficer.vendorid == vendorindb.id).filter(
            VendorOfficer.personid == personindb.id).count())
        if existing < 1:
            # BUG FIX: log.info was called with "{0}"/"{1}" placeholders,
            # which %-style logging never formats.
            log.info("Linking %s to %s",
                     str(vendorindb.id), str(personindb.id))
            SESSION.add(VendorOfficer(vendorindb.id, personindb.id))
            SESSION.commit()
        return
    if companyindb is not None and personindb is None:
        existing = (SESSION.query(VendorOfficerCompany).filter(
            VendorOfficerCompany.vendorid == vendorindb.id).filter(
            VendorOfficerCompany.companiesid == companyindb.id).count())
        if existing < 1:
            # Consistency: use logging, not print, like the person branch.
            log.info("Linking %s to %s",
                     str(vendorindb.id), str(companyindb.id))
            SESSION.add(VendorOfficerCompany(vendorindb.id, companyindb.id))
            SESSION.commit()
        return
    SESSION.close()
def _write_purchase_order(self, html, file_location):
    '''
    Write a contract page's HTML to disk, creating the parent directory
    when it does not exist yet.

    :param html: The individual contract page's HTML.
    :type html: string.
    :param file_location: The path to where the file should be created.
    :type file_location: string.
    '''
    directory = os.path.dirname(file_location)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(file_location, 'w') as output_file:
        log.info('Saving HTML for purchase order %s',
                 self.purchase_order_number)
        output_file.write(html)
def match_local_database_to_document_cloud(self):
    '''
    Sync partially filled local contract rows from DocumentCloud.

    Pulls the half-filled contracts from the Lens database and refreshes
    each one's metadata from its DocumentCloud document.
    '''
    contracts = LensDatabase().get_half_filled_contracts()
    log.info('%d half-filled contracts need to be synced', len(contracts))
    for stub in contracts:
        try:
            self._match_contract(
                self.client.documents.get(stub.doc_cloud_id))
        except Exception as error:
            # One failed document should not abort the remaining syncs.
            log.error(error, exc_info=True)
def get_people_associated_with_vendor(self, name):
    """
    Get a list of people associated with the vendor.

    Not called by this class; used by emailer.py.

    :param name: The vendor name.
    :type name: string
    :returns: list. Names of the people joined to this vendor through
        the vendor-officer link table.
    """
    rows = (SESSION.query(Person.name)
            .filter(Vendor.id == VendorOfficer.vendorid)
            .filter(Person.id == VendorOfficer.personid)
            .filter(Vendor.name == name)
            .all())
    SESSION.close()
    log.info('%d people associated with %s', len(rows), name)
    return [str(row[0]) for row in rows]
def check_if_database_has_contract(self, purchase_order_number):
    """
    Check if the local database already has this contract.

    :param purchase_order_number: The unique ID in the city's website.
    :type purchase_order_number: string
    :returns: boolean. True if the contract is present, False if not.
    """
    count = (SESSION.query(Contract)
             .filter(Contract.purchaseordernumber == purchase_order_number)
             .count())
    SESSION.close()
    # BUG FIX: was `count == 1`, so duplicate rows made this report the
    # contract as missing and invited yet another insert.
    if count >= 1:
        log.info('DB contracts table already has purchase order %s',
                 purchase_order_number)
        return True
    log.info('DB contracts table does not have purchase order %s',
             purchase_order_number)
    return False
def _download_vendor_profile(city_vendor_id):
    '''
    Download and cache a vendor's profile page, skipping vendors whose
    pages are already on disk.

    :param city_vendor_id: The vendor ID on the city's purchasing site.
    :type city_vendor_id: string.
    '''
    vendor_file_location = '%s/%s.html' % (VENDORS_DIR, city_vendor_id)
    if os.path.isfile(vendor_file_location):
        log.info('Already have HTML for vendor %s', city_vendor_id)
        return
    try:
        profile_url = (
            'http://www.purchasing.cityofno.com/'
            'bso/external/vendor/vendorProfileOrgInfo.sdo?'
            'external=true&vendorId={}'.format(city_vendor_id))
        html = urllib2.urlopen(profile_url).read()
        directory = os.path.dirname(vendor_file_location)
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(vendor_file_location, 'w') as vendor_file:
            log.info('Saving HTML for vendor %s', city_vendor_id)
            vendor_file.write(html)
    except urllib2.HTTPError:
        # Best effort: the vendor page is not always publicly posted.
        log.info('Could not save HTML for vendor %s', city_vendor_id)
def _get_attachments(soup):
    '''
    Locate the attachment links on a contract page.

    :param soup: A BeautifulSoup object for the contract page's HTML.
    :type soup: BeautifulSoup object.
    :returns: list. Anchor tags for the attachment files; empty when the
        page lists none.
    '''
    try:
        table = soup.select('.table-01').pop()
        detail_rows = table \
            .findChildren(['tr'])[2] \
            .findChildren(['td'])[0] \
            .findChildren(['table'])[0] \
            .findChildren(['tr'])
        return detail_rows[16].findChildren(['td'])[1].findChildren(['a'])
    except IndexError:
        log.info('No attachments found')
        return []  # The city does not always include attachment files.
def _download_attachment(self, attachment):
    '''
    Fetch one purchase-order attachment's PDF if not already on disk.

    :param attachment: Link element for the attachment; its href holds
        the city's internal attachment ID.
    '''
    # Use the city's internal ID as the local file name so we can tell
    # which attachments we already have.
    attachment_id = re.search('[0-9]+', attachment.get('href')).group()
    log.debug('Gathering data for attachment %s', attachment_id)
    pdf_path = '%s/%s.pdf' % (DOCUMENTS_DIR, attachment_id)
    display_name = self._get_attachment_display_name(attachment_id)
    if os.path.isfile(pdf_path):
        log.info('Already have PDF for attachment %s', attachment_id)
    else:
        self._download_attachment_file(
            attachment_id, display_name, pdf_path)
def add_name(self, name):
    """Store name as a Person or Company row when it is not already known."""
    normalized = name.replace(".", "").strip()
    if self.is_this_a_person(normalized):
        # Names ending in Jr. and similar suffixes count as people.
        matches = SESSION.query(Person).filter(
            Person.name == normalized).count()
        if matches == 0:
            SESSION.add(Person(normalized))
            SESSION.commit()
            return
        if matches == 1:
            SESSION.close()
            return
    if self._is_this_a_company(normalized):
        matches = SESSION.query(Company).filter(
            Company.name == normalized).count()
        if matches == 0:
            SESSION.add(Company(normalized))
            SESSION.commit()
            return
        if matches == 1:
            SESSION.close()
            return
    log.info("Could not link %s", normalized)
    SESSION.close()
url = 'http://www.purchasing.cityofno.com/bso/' + \ 'external/advsearch/searchContract.sdo' req = urllib2.Request(url=url, data=data) req.add_header('Pragma', ' no-cache') req.add_header('Origin', 'http://www.purchasing.cityofno.com') req.add_header('Accept-Encoding', 'gzip, deflate') req.add_header('Content-Type', 'application/x-www-form-urlencoded') req.add_header( 'Accept', 'text/add_contracthtml,application/xhtml+xml,application/xml;' + 'q=0.9,image/webp,*/*;q=0.8') req.add_header('Cache-Control', 'no-cache') req.add_header( 'Referer', 'http://www.purchasing.cityofno.com/bso/external/advsearch/' + 'searchContract.sdo') req.add_header('Connection', 'keep-alive') req.add_header('DNT', '1') response = urllib2.urlopen(req) output = response.read() response.close() return output if __name__ == '__main__': log.info("Checking the city's purchasing site for new contracts") CheckCity().check_pages()
'external/advsearch/searchContract.sdo' req = urllib2.Request(url=url, data=data) req.add_header('Pragma', ' no-cache') req.add_header('Origin', 'http://www.purchasing.cityofno.com') req.add_header('Accept-Encoding', 'gzip, deflate') req.add_header('Content-Type', 'application/x-www-form-urlencoded') req.add_header( 'Accept', 'text/add_contracthtml,application/xhtml+xml,application/xml;' + 'q=0.9,image/webp,*/*;q=0.8' ) req.add_header('Cache-Control', 'no-cache') req.add_header( 'Referer', 'http://www.purchasing.cityofno.com/bso/external/advsearch/' + 'searchContract.sdo' ) req.add_header('Connection', 'keep-alive') req.add_header('DNT', '1') response = urllib2.urlopen(req) output = response.read() response.close() return output if __name__ == '__main__': log.info("Checking the city's purchasing site for new contracts") CheckCity().check_pages()