def parse_containerized(self, attachment, email_rpt_date):
    """Parse the containerized-cargo attachment of a Mombasa port report.

    Args:
        attachment (Attachment): email attachment holding an encrypted workbook
        email_rpt_date (str): reported date taken from the email

    Yields:
        Dict[str, str]:
    """
    workbook = decrypt(attachment.body, password='******')
    for sheet in workbook.sheets():
        # store state of the table, in order to get relevant rows to extract
        start_processing = False
        # FIX: candidates are compared against a lowercased sheet name, so
        # they must themselves be lowercase ('dry Bulk' could never match)
        if sheet.name.lower() in ('liquid bulk', 'dry bulk', 'steel'):
            for idx, raw_row in enumerate(sheet.get_rows()):
                # Include handling of xlrd.xldate.XLDateAmbiguous cases
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # header row marks the start of the relevant table; skip rows above it
                if 'vessel' in row[0].lower():
                    start_processing = True
                    header = row
                    continue

                if start_processing:
                    raw_item = {
                        may_strip(head.lower()): row[col_idx]
                        for col_idx, head in enumerate(header)
                    }
                    # contextualise raw item with meta info
                    raw_item.update(
                        reported_date=email_rpt_date,
                        provider_name=self.provider,
                        port_name='Mombasa',
                    )
                    if DataTypes.CargoMovement in self.produces:
                        yield normalize_containerized.process_item(raw_item)
def parse_mail(self, mail):
    """Parse the coal attachment of an Australian iron/coal report email.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    # memoise reported date so it does not need to be recomputed per row
    self.reported_date = to_isoformat(mail.envelope['date'])
    for attachment in mail.attachments():
        if 'COAL' not in attachment.name:
            continue

        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        sheet = workbook.sheet_by_index(0)
        start_processing = None
        for raw_row in sheet.get_rows():
            row = format_cells_in_row(raw_row, sheet.book.datemode)

            # the 'Country' row is the table header; data rows follow it
            if 'Country' in row[0]:
                start_processing = True
                continue

            if not start_processing:
                continue

            raw_item = {name: row[col] for col, name in enumerate(HEADERS)}
            raw_item.update(provider_name=self.provider, reported_date=self.reported_date)
            yield normalize.process_item(raw_item)
def parse_attachment(self, attachment, email_rpt_date, raw_sheet):
    """Extract data rows from a single sheet of an email attachment.

    Args:
        attachment (Attachment):
        email_rpt_date (str):
        raw_sheet (Sheet):

    Yields:
        Dict[str, str]:
    """
    header = None
    for raw_row in raw_sheet.get_rows():
        # Include handling of xlrd.xldate.XLDateAmbiguous cases
        row = format_cells_in_row(raw_row, raw_sheet.book.datemode)

        # the row containing 'date' is the table header; memoise it
        if 'date' in row[0].lower():
            header = row
            continue

        # rows before the header are irrelevant
        if header is None:
            continue

        raw_item = {may_strip(name.lower()): row[col] for col, name in enumerate(header)}
        # contextualise raw item with meta info
        raw_item.update(reported_date=email_rpt_date, provider_name=self.provider)
        yield normalize.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    for attachment in mail.attachments():
        # prefer the reported date embedded in the attachment name, falling
        # back to the email header date when the name carries none
        date_match = re.search(r'(\d+(/|-)\d+(/|-)\d{2,4})', attachment.name)
        reported_date = date_match.group() if date_match else mail.envelope['date']

        # some files have multiple sheets within them; extract all of them
        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        for sheet in workbook.sheets():
            # only visible sheets are required
            if sheet.visibility != 0:
                continue

            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # memoise the header row when it appears
                if 'Vessel Name' in row:
                    header = row
                    continue

                # discard empty rows
                if not row[0]:
                    continue

                raw_item = {col_name: row[col] for col, col_name in enumerate(header)}
                raw_item.update(
                    # we use the sheet tab name as the port name instead of listed port
                    provider_name=self.provider,
                    reported_date=reported_date,
                    port_name=may_strip(sheet.name),
                )
                yield from normalize.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    for attachment in mail.attachments():
        for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
            # FIX: guard against data rows appearing before the header row;
            # previously such rows raised NameError on the unbound `header`
            header = None
            for idx, raw_row in enumerate(sheet.get_rows()):
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # ignore irrelevant rows
                if idx == 0:
                    continue

                # extract header row; report headers are inconsistent
                if 'date reported' in row[0].lower():
                    header = row
                    continue

                # skip rows until the header has been seen
                if header is None:
                    continue

                raw_item = {head.lower(): row[col] for col, head in enumerate(header)}
                raw_item.update(provider_name=self.provider)
                yield normalize.process_item(raw_item)
def parse_mail(self, mail):
    """parse 2 email attachments iron and coal for australia

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    for attachment in mail.attachments():
        for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
            # to prevent the processing of the summary and useless sheets
            if sheet.name in ['汇总', 'Sheet1', 'Sheet2']:
                continue

            # FIX: rows occurring before the header row previously raised
            # NameError on the unbound `headers`; skip them instead
            headers = None
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # detect the header row and memoise it
                if 'vessel' in row[2].lower():
                    headers = row
                    continue

                if headers is None:
                    continue

                raw_item = {h.lower(): row[col] for col, h in enumerate(headers)}
                raw_item.update(provider_name=self.provider, port_name=sheet.name)
                yield normalize.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    for attachment in mail.attachments():
        # some files have multiple sheets within them; extract all of them
        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        for sheet in workbook.sheets():
            # only visible sheets are required
            if sheet.visibility != 0:
                continue

            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # memoise the header row when it appears
                if 'VESSEL' in row:
                    header = row
                    continue

                # remove empty rows
                if not (row[0] and row[1] and row[2]):
                    continue

                raw_item = {name: row[pos] for pos, name in enumerate(header)}
                raw_item.update(provider_name=self.provider)
                yield from normalize.process_item(raw_item)
def parse_tanzania(self, attachment, email_rpt_date):
    """Parse a Tanzania (Dar es Salaam) lineup attachment.

    Args:
        attachment (Attachment):
        email_rpt_date (str):

    Yields:
        Dict[str, str]:
    """
    workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
    for sheet in workbook.sheets():
        # rows are only relevant once the header row has been seen
        header = None
        for raw_row in sheet.get_rows():
            # Include handling of xlrd.xldate.XLDateAmbiguous cases
            row = format_cells_in_row(raw_row, sheet.book.datemode)

            # the 'vessel name' row marks the start of the table
            if 'vessel name' in row[0].lower():
                header = row
                continue

            if header is None:
                continue

            raw_item = {may_strip(col.lower()): row[i] for i, col in enumerate(header)}
            # contextualise raw item with meta info
            raw_item.update(
                reported_date=email_rpt_date,
                provider_name=self.provider,
                port_name='Dar Es Salam',
            )
            if DataTypes.CargoMovement in self.produces:
                yield from normalize_tanzania.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    # get reported date from email header and memoise to avoid calling it repeatedly below
    reported_date = mail.envelope['date']

    for attachment in mail.attachments():
        # only process relevant attachments (denoted by their names)
        if not any(token in attachment.name.lower() for token in self.relevant_files):
            continue

        # some files have multiple sheets within them; extract all of them
        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        for sheet in workbook.sheets():
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # extract header row
                # NOTE sometimes the header will not contain the "PORT" key where it should be
                # "PORT key should be the first element in header
                if 'VESSEL' in row:
                    header = row
                    header[0] = 'PORT'
                    header[5] = 'PRE. PORT'
                    continue

                # remove empty rows and rows with "NIL" vessels
                if not row[3] or row[3] in ['NIL', 'TBN']:
                    continue

                raw_item = {key: row[pos] for pos, key in enumerate(header)}
                raw_item.update(
                    # we use the sheet tab name as the port name instead of listed port
                    region_name=sheet.name,
                    provider_name=self.provider,
                    reported_date=reported_date,
                    spider_name=self.name,
                )
                if DataTypes.SpotCharter in self.produces:
                    yield from normalize_charters.process_item(raw_item)
                # FIXME supposed to be `DataTypes.PortCall` here, but we don't want
                # data-dispatcher to consume data from these spiders and the ETL to create PCs
                else:
                    yield from normalize_grades.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    self.reported_date = to_isoformat(mail.envelope['date'])
    for attachment in mail.attachments():
        for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
            if 'line-up' not in sheet.name.lower():
                continue

            # FIX: reset table state for every sheet; previously `start_process`
            # was declared once before the attachment loop and leaked across
            # sheets/attachments, and `year` raised NameError when a data row
            # appeared before the "LIST (<year>)" title row.
            # NOTE(review): assumes each line-up sheet carries its own
            # "LIST (<year>)" title row — confirm against sample reports
            start_process = False
            year = None
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # extract header row
                if 'VESSEL' in row[0].upper():
                    header = row
                    start_process = True
                    continue

                # the "LIST (<year>)" title row carries the year and ends a table
                if 'LIST' in row[0].upper():
                    year = row[0].partition('(')[-1].partition(')')[0]
                    start_process = False
                    continue

                if start_process:
                    # remove empty rows
                    if not row[0]:
                        continue

                    raw_item = {head.upper(): row[col] for col, head in enumerate(header)}
                    raw_item.update(
                        provider_name=self.provider,
                        reported_date=self.reported_date,
                        port_name=sheet.name,
                        year=year,
                    )
                    if DataTypes.SpotCharter in self.produces:
                        yield normalize_charters.process_item(raw_item)
                    else:
                        yield from normalize_grades.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    # get hypothetical reported date from email header
    reported_date = mail.envelope['date']
    for attachment in mail.attachments():
        # some files have multiple sheets within them; extract all of them
        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        for sheet in workbook.sheets():
            # only visible sheets are required
            if sheet.visibility != 0:
                continue

            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # memoise the header row
                if 'VESSEL' in row:
                    header = row
                    continue

                # extract real reported date
                if 'Date:' in row:
                    reported_date = row[2]
                    continue

                # remove empty rows and rows with "NIL" vessels
                if not row[3] or row[3] in ['NIL', 'TBN']:
                    continue

                raw_item = {name: row[col] for col, name in enumerate(header)}
                raw_item.update(
                    # we use the sheet tab name as the port name instead of listed port
                    region_name=sheet.name,
                    provider_name=self.provider,
                    reported_date=reported_date,
                    spider_name=self.name,
                    port_name=may_strip(sheet.name.lower()),
                )
                if DataTypes.SpotCharter in self.produces:
                    yield from normalize_charters.process_item(raw_item)
                # FIXME supposed to be `DataTypes.PortCall` here, but we don't want
                # data-dispatcher to consume data from these spiders and the ETL
                # to create PCs
                elif DataTypes.Cargo in self.produces:
                    yield from normalize_grades.process_item(raw_item)
def parse_weekly(self, response, e_body=None, email_rpt_date=None):
    """Parse a weekly iron/coal lineup, sourced from a web link or an email.

    Args:
        response (Response | None): web response (takes precedence when given)
        e_body (Attachment | None): email attachment body
        email_rpt_date (str | None): reported date taken from the email

    Yields:
        Dict[str, str]:
    """
    # get appropriate response from link or web
    if response:
        info_body = response
        info_body_name = str(info_body)
        info_rpt_date = info_body.meta.get('reported_date')
    else:
        info_body = e_body
        info_body_name = info_body.name
        info_rpt_date = email_rpt_date

    # port names are fixed and appended above each section
    if 'iron' in info_body_name.lower():
        PORTS = IRON_PORTS
        COMMODITY = 'iron'
    if 'coal' in info_body_name.lower():
        PORTS = COAL_PORTS
        COMMODITY = 'coal'

    for sheet in xlrd.open_workbook(file_contents=info_body.body, on_demand=True).sheets():
        # FIX: `('vessel line up')` is a plain string, not a tuple, so the
        # original `in` did a substring test and matched unintended sheet
        # names; the 1-tuple restores the intended exact-name comparison
        if sheet.name.isdigit() or sheet.name.lower() in ('vessel line up',):
            header = None
            port_name = None
            installation = None
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # detect if cell is a port cell and memoise it
                if row[0].lower() in PORTS:
                    port_name = PORTS[row[0].lower()][0]
                    installation = PORTS[row[0].lower()][1]

                # detect header row
                if row[0].lower() == 'vessel':
                    header = row
                    continue

                if header:
                    raw_item = {h.lower(): row[col] for col, h in enumerate(header)}
                    raw_item.update(
                        provider_name=self.provider,
                        reported_date=info_rpt_date,
                        port_name=port_name,
                        installation=installation,
                        file_name=COMMODITY,
                    )
                    yield normalize.process_item(raw_item)
def parse_raw_sheet(self, sheet, reported_date):
    """parse excel sheet

    Args:
        sheet (Sheet): xlrd sheet; its name encodes product and candidate ports
        reported_date (str):

    Yields:
        Dict[str, str]
    """
    start_processing = False
    # memoises every row seen so far; the rows just above a 'vessel' header
    # are what detect_installation_and_port inspects to resolve the port
    tmp_row = []
    # get product and the possible port name from the sheet name itself
    product, possible_ports = self.get_port_and_product(sheet.name)
    for idx, raw_row in enumerate(sheet.get_rows()):
        row = format_cells_in_row(raw_row, sheet.book.datemode)
        # to detect the start of the vessel table
        if len(row) > 0 and row[0].lower() == 'vessel':
            headers = row
            start_processing = True
            port_name, installation = self.detect_installation_and_port(
                tmp_row, possible_ports)
            # push the first element to the last. To handle sheets having multiple vessel table.
            # (rotation keeps the not-yet-matched candidate port first for the next table)
            if possible_ports and port_name != possible_ports[0]:
                possible_ports = possible_ports[1:] + possible_ports[:1]
            continue
        if start_processing:
            # NOTE the comprehension's `idx` shadows the outer row index;
            # here it is the column index within the header row
            raw_item = {
                h.lower(): row[idx] for idx, h in enumerate(headers)
            }
            raw_item.update(
                provider_name=self.provider,
                reported_date=reported_date,
                port_name=port_name,
                cargo_product=product,
                installation=installation,
            )
            yield normalize.process_item(raw_item)
    # memoizing the last row scrapped
        tmp_row.append(row)
def parse_mail(self, mail):
    """The method will be called for every mail the search_term matched.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    # memoise and use received timestamp of email as default reported date
    default_reported_date = parse_date(mail.envelope['date']).strftime('%d %b %Y')

    for attachment in mail.attachments():
        # FIX: derive the reported date per attachment without clobbering the
        # email-header default (previously the first attachment's parsed date
        # leaked into later attachments via the shared variable); also hoisted
        # out of the sheet loop since it depends only on the attachment name
        reported_date = self.parse_reported_date(attachment.name) or default_reported_date

        for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
            # only process the sheet we want
            if sheet.name.lower() in IRRELEVANT_SHEET_NAME:
                continue

            for idx, raw_row in enumerate(sheet.get_rows()):
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # first row is useless, discard it
                if idx == 0:
                    continue

                # second row will always contain header; extract it
                if idx == 1:
                    header = row
                    continue

                # extract data row
                if row[VESSEL_COL_IDX] and row[VESSEL_COL_IDX] != 'TBN':
                    raw_item = {head: row[head_idx] for head_idx, head in enumerate(header)}
                    raw_item.update(reported_date=reported_date, provider_name=self.provider)
                    if DataTypes.SpotCharter in self.produces:
                        yield normalize_charters.process_item(raw_item)
                    # FIXME supposed to be `DataTypes.PortCall` here, but we don't want
                    # data-dispatcher to consume data from these spiders
                    # and the ETL to create PCs
                    else:
                        yield from normalize_grades.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    for attachment in mail.attachments():
        # each xlsx file by this provider will only have one sheet
        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        sheet = workbook.sheet_by_index(0)
        self.port_name = mail.envelope['subject']

        for idx, raw_row in enumerate(sheet.get_rows()):
            row = format_cells_in_row(raw_row, sheet.book.datemode)

            # reported date lives in a fixed cell on the second row
            if idx == 1:
                self.reported_date = row[12]

            # drop filler rows before the table and the trailing footer
            if idx < HEADER_ROW or row[0] == 'TERMINAL POSTING':
                continue

            # memoise the header row
            if idx == HEADER_ROW:
                header = row
                continue

            raw_item = {name: row[col] for col, name in enumerate(header) if name}
            # contextualise raw item with metadata
            raw_item.update(
                provider_name=self.provider,
                reported_date=self.reported_date,
                port_name=self.port_name,
            )
            yield normalize.process_item(raw_item)
def parse_mail(self, mail):
    """Extract mail found with specified email filters in spider arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    self.reported_date = to_isoformat(mail.envelope['date'])
    for attachment in mail.attachments():
        for sheet_name in self.relevant_files:
            workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
            sheet = workbook.sheet_by_name(sheet_name)
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # ignore irrelevant rows
                if not row[0]:
                    continue

                # extract header row; report headers are inconsistent
                if 'PORT' in row[0].upper():
                    header = row
                    continue

                raw_item = {name.upper(): row[col] for col, name in enumerate(header)}
                raw_item.update(
                    provider_name=self.provider,
                    reported_date=self.reported_date,
                    sheet_name=sheet.name,
                    spider_name=self.name,
                )
                if DataTypes.SpotCharter in self.produces:
                    yield normalize_charters.process_item(raw_item)
                else:
                    yield normalize_grades.process_item(raw_item)
def parse_mail(self, mail):
    """Extract data from each mail matched by the query spider argument.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    # memoise reported_date so it won't need to be called repeatedly later
    reported_date = to_isoformat(mail.envelope['date'])

    for attachment in mail.attachments():
        sheet = xlrd.open_workbook(
            file_contents=attachment.body, on_demand=True
        ).sheet_by_index(0)

        # FIX: reset table state for every attachment; previously the flag was
        # initialised once before the loop, so after the first attachment's
        # header was found, pre-header rows of later attachments were
        # processed against a stale `header`
        start_processing = False

        # store state of the table, in order to get relevant rows to extract
        for raw_row in sheet.get_rows():
            row = format_cells_in_row(raw_row, sheet.book.datemode)

            # the row whose second cell mentions 'date' is the table header
            if 'date' in row[1].lower():
                start_processing = True
                header = row
                continue

            if start_processing:
                raw_item = {may_strip(head): row[col] for col, head in enumerate(header)}
                # contextualise raw item with meta info
                raw_item.update(
                    reported_date=reported_date,
                    provider_name=self.provider,
                    # source is for Fujairah, FOTT Terminal
                    port_name='Fujairah',
                    installation='Fujairah Oil Tanker Terminals',
                )
                yield from normalize.process_item(raw_item)
def parse_mail(self, mail):
    """parse email

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    for attachment in mail.attachments():
        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        for sheet in workbook.sheets():
            start_processing = False
            reported_date = None
            port = None
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)
                joined = ''.join(row).lower()

                # the title row doubles as the reported-date source
                if 'coal / coke vessel line up' in joined:
                    reported_date = may_strip(joined)
                    continue

                # the 'position' row is the table header
                if 'position' in row[0].lower():
                    headers = row
                    start_processing = True
                    continue

                if not start_processing:
                    continue

                # a mostly-empty row carries the port name for the rows below it
                if row.count('') >= 10:
                    port = row[0]
                    continue

                raw_item = {name.lower(): row[col] for col, name in enumerate(headers)}
                raw_item.update(
                    provider_name=self.provider, port_name=port, reported_date=reported_date
                )
                yield normalize.process_item(raw_item)
def parse_xls_report(self, response, e_body=None, email_rpt_date=None):
    """Parse an xls report obtained either from a web link or an email body.

    Args:
        response (Response | None): web response (takes precedence when given)
        e_body (Attachment | None): email attachment body
        email_rpt_date (str | None): reported date taken from the email

    Yields:
        Dict[str, str]:
    """
    # get appropriate response from link or web
    if response:
        info_body = response
        info_rpt_date = response.meta.get('reported_date')
    else:
        info_body = e_body
        info_rpt_date = email_rpt_date

    workbook = xlrd.open_workbook(file_contents=info_body.body, on_demand=True)
    for sheet in workbook.sheets():
        header = None
        for raw_row in sheet.get_rows():
            row = format_cells_in_row(raw_row, sheet.book.datemode)

            # detect header row; data rows follow it
            if 'country' in row[0].lower():
                header = row
                continue

            if header is None:
                continue

            raw_item = {name.lower(): row[col] for col, name in enumerate(header)}
            raw_item.update(provider_name=self.provider, reported_date=info_rpt_date)
            yield normalize.process_item(raw_item)
def parse_mail(self, mail):
    """Parse each email that was matched with the spider filter arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:
    """
    for attachment in mail.attachments():
        workbook = xlrd.open_workbook(file_contents=attachment.body, on_demand=True)
        for sheet in workbook.sheets():
            header = None
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # a row is the header when any of its key cells matches HEADER_STR
                if any(may_strip(cell.lower()) in HEADER_STR for cell in (row[0], row[1], row[3])):
                    header = row
                    continue

                # rows before the header are irrelevant
                if header is None:
                    continue

                raw_item = {
                    may_strip(name.lower()): may_strip(row[col].replace('\n', '|').lower())
                    for col, name in enumerate(header)
                }
                # occasionally the header might be missing an important header
                if '' in raw_item:
                    raw_item['eta_holder'] = raw_item.pop('', None)
                raw_item.update(
                    provider_name=self.provider,
                    reported_date=mail.envelope['subject'],
                    port_name=may_strip(sheet.name.lower()),
                )
                yield from normalize.process_item(raw_item)