예제 #1
0
def _split_volume_unit(raw_volume):
    """Split a single raw volume token into its quantity and unit.

    Args:
        raw_volume (str): one volume token, e.g. '600kb' or '600000'

    Returns:
        Tuple[str, str]: quantity and unit; a token without a recognised unit
            suffix is returned unchanged and assumed to be in barrels

    """
    match = re.match(r'(\d+)(kb|bbl|bbls|bblss)', raw_volume)
    if match:
        quantity, unit = match.groups()
    else:
        quantity, unit = raw_volume, Unit.barrel
    return quantity, UNIT_MAPPING.get(unit, unit)


def normalize_product_volume(raw_product, raw_volume):
    """Pair each product with its volume and volume unit.

    Both inputs may hold several entries separated by '/' or '+'.
    Supported combinations:
        1) as many volumes as products: paired positionally
        2) several products sharing one volume: the volume is split evenly

    Any other combination yields an empty list.

    Args:
        raw_product (str): one or more product names
        raw_volume (str): one or more volumes, optionally suffixed with a unit

    Returns:
        List[Tuple[str, str, str]]: (product, volume, unit) for each product

    Examples:
        >>> normalize_product_volume('SOKOL CRUDE OIL', '600kb')
        [('SOKOL CRUDE OIL', '600', 'kilobarrel')]
        >>> normalize_product_volume('SOKOL CRUDE OIL', '600000')
        [('SOKOL CRUDE OIL', '600000', 'barrel')]
        >>> normalize_product_volume('Amna/sirtica', '952972/904732')
        [('Amna', '952972', 'barrel'), ('sirtica', '904732', 'barrel')]
        >>> normalize_product_volume('WTI/E45', '2036690')
        [('WTI', '1018345.0', 'barrel'), ('E45', '1018345.0', 'barrel')]
        >>> normalize_product_volume('WTI/E45', '2036kb')
        [('WTI', '1018.0', 'kilobarrel'), ('E45', '1018.0', 'kilobarrel')]
    """
    product_list = re.split(r'[\/\+]', raw_product)
    volume_list = re.split(r'[\/\+]', raw_volume)

    # one volume per product: pair them up positionally
    if len(product_list) == len(volume_list):
        f_list = []
        for product, volume in zip(product_list, volume_list):
            quantity, unit = _split_volume_unit(volume)
            f_list.append((product, quantity, unit))
        return f_list

    # several products sharing a single volume: split it evenly between them
    if len(product_list) > 1 and len(volume_list) == 1:
        quantity, unit = _split_volume_unit(volume_list[0])
        # float() rather than int() so that a decimal quantity does not raise
        # NOTE(review): a non-numeric shared volume ends up as the string 'None'
        # below - confirm that is what downstream consumers expect
        share = float(quantity) / len(product_list) if is_number(quantity) else None
        return [(product, str(share), unit) for product in product_list]

    return []
예제 #2
0
def normalize_prod_vol_move(raw_product, raw_volume_movement,
                            raw_opt_movement):
    """Split products, volumes and movements into one entry per product.

    Products and volumes are split on the common separators `\\ / , & + |`.
    Two layouts are supported:
        1) as many volumes as products: paired positionally, volumes stay strings
        2) several products sharing one numeric volume: the volume is divided
           evenly between the products and returned as floats

    Args:
        raw_product (str): one or more product names
        raw_volume_movement (str): one or more volumes, each optionally followed
            by a movement in parentheses, e.g. '6500 mt (d)'
        raw_opt_movement (str | None): movement taken from a dedicated column;
            when given it overrides any movement found next to the volumes

    Examples:
        >>> normalize_prod_vol_move('butane', '3000', 'd')
        [['butane', '3000', 'd']]
        >>> normalize_prod_vol_move('butane/propane', '3000', 'd')
        [['butane', 1500.0, 'd'], ['propane', 1500.0, 'd']]
        >>> normalize_prod_vol_move('a/b/c', '3000', 'd')
        [['a', 1000.0, 'd'], ['b', 1000.0, 'd'], ['c', 1000.0, 'd']]
        >>> normalize_prod_vol_move('butane/propane', '3000/3000', 'd')
        [['butane', '3000', 'd'], ['propane', '3000', 'd']]
        >>> normalize_prod_vol_move('propane', '6500 mt (d)', None)
        [['propane', '6500', 'd']]
        >>> normalize_prod_vol_move('toluene + mx', '3133+ 950 mt (d)', None)
        [['toluene ', '3133', ''], [' mx', '950', 'd']]
        >>> normalize_prod_vol_move('toluene + mx', '5694 mt (d) + 3640 mt (d)', None)
        [['toluene ', '5694', 'd'], [' mx', '3640', 'd']]

    Returns:
        List[List] | None: [product, volume, movement] per product, or None when
            the products and volumes cannot be matched up
    """
    # drop thousands separators before the comma is used as a splitting character
    raw_volume_movement = raw_volume_movement.replace(',', '')
    product_list = re.split(r'[\\/,\&\+\|]', raw_product)
    volume_movement_list = re.split(r'[\\/,\&\+\|]', raw_volume_movement)
    volume_list, movement_list = extract_volume_movement(volume_movement_list)

    # overwrite movement list if the source has a movement col
    if raw_opt_movement:
        movement_list = [may_strip(raw_opt_movement)] * len(product_list)

    if len(product_list) == len(volume_list):
        return [
            list(row)
            for row in zip_longest(product_list, volume_list, movement_list)
        ]

    if (len(product_list) != len(volume_movement_list)
            and len(volume_list) == 1):
        if is_number(volume_list[0]):
            # share the single volume evenly across every product
            share = float(volume_list[0]) / len(product_list)
            volume = [share] * len(product_list)
        else:
            # zip_longest pads the missing volumes with None
            volume = []
        return [
            list(row) for row in zip_longest(product_list, volume, movement_list)
        ]

    return None
예제 #3
0
def validate_weight(raw_weight: str) -> Optional[str]:
    """Keep a weight only when it is a strictly positive number.

    Args:
        raw_weight (str): weight as found in the source

    Returns:
        Optional[str]: the weight as a string, or None when it is not numeric,
            zero or negative

    Examples:
        >>> validate_weight('')
        >>> validate_weight('0')
        >>> validate_weight('10000')
        '10000'

    """
    if not is_number(raw_weight):
        return None

    if float(raw_weight) > 0:
        return str(raw_weight)

    return None
예제 #4
0
def normalize_date(
    raw_eta: str,
    raw_berthed_day: str,
    raw_berthed_month: str,
    raw_berthed_time: str,
    reported_date: str,
) -> Tuple[str, str]:
    """Normalize berthing and eta dates.

    The berthing date is assembled from separate day, month and time cells. When the
    month cell holds a full `dd-Mon-yy` date, its day, month and year are used; otherwise
    month and year fall back to those of the reported date. A numeric day cell always
    takes precedence over the day found in the month cell. The eta is completed with
    the same year as the berthing date.

    Some examples of possible input combinations:
        - 'raw_eta': '01-Feb'
        - 'raw_berthed_day': '01-Mar-20'
        - 'raw_berthed_day': '01'
        - 'raw_berthed_time': '16:00'

    Args:
        raw_eta (str): day and month of the eta, e.g. '01-Feb'
        raw_berthed_day (str): day of month, or a full 'dd-Mon-yy' date
        raw_berthed_month (str): full 'dd-Mon-yy' date carrying the month, may be None
        raw_berthed_time (str): time as 'HH:MM'
        reported_date (str): date of the report

    Returns:
        Tuple[str, str]: (berthed, eta), both in ISO 8601 format

    Raises:
        ValueError: if no day of month can be determined for the berthing date

    Examples:
        >>> normalize_date('01-Feb', '01', None, '17:00', '2020-02-01T00:00:00')
        ('2020-02-01T17:00:00', '2020-02-01T00:00:00')
        >>> normalize_date('01-Feb', '01-Mar-20', '01-Mar-20', '17:00', '2020-02-01T00:00:00')
        ('2020-03-01T17:00:00', '2020-02-01T00:00:00')
        >>> normalize_date('01-Feb', '01', '01-Mar-20', '17:00', '2020-02-01T00:00:00')
        ('2020-03-01T17:00:00', '2020-02-01T00:00:00')
        >>> normalize_date('01-Feb', '01', '01-Mar-2020', '17:00', '2020-02-01T00:00:00')
        ('2020-03-01T17:00:00', '2020-02-01T00:00:00')
    """
    _day = None

    # check if berthed month contains a month else discard
    month_parts = str(raw_berthed_month).split('-')
    if len(month_parts) == 3:
        _day, _month, _year = month_parts
        # expand short years only; a four-digit year is already complete
        if len(_year) != 4:
            _year = f'20{_year}'
    else:
        reported = parse_date(reported_date)
        _month, _year = reported.month, reported.year

    if is_number(raw_berthed_day):
        _day = raw_berthed_day

    if _day is None:
        raise ValueError(
            f'cannot determine berthed day from day={raw_berthed_day!r} '
            f'and month={raw_berthed_month!r}'
        )

    return (
        parse_date(
            f'{_year}-{_month}-{_day} {raw_berthed_time}:00').isoformat(),
        parse_date(f'{raw_eta}-{_year}', dayfirst=True).isoformat(),
    )
예제 #5
0
def extract_volume_movement(vol_movement):
    """Separate volumes from the movement given in parentheses next to them.

    Whitespace, the 'mt' unit and parentheses are removed from both parts. An entry
    without a parenthesised movement gets an empty movement.

    Args:
        vol_movement (List[str]): raw volume entries, e.g. '3640 mt (d)'

    Examples:
        >>> extract_volume_movement(['3000'])
        (['3000'], [''])
        >>> extract_volume_movement(['3640 mt (d)', '3640 mt (d)'])
        (['3640', '3640'], ['d', 'd'])
        >>> extract_volume_movement(['3640 mt', '3640 mt (d)'])
        (['3640', '3640'], ['', 'd'])
        >>> extract_volume_movement(['3640 mt'])
        (['3640'], [''])

    Returns:
        Tuple[List[str], List[str]]: volumes and movements, index-aligned with the input
    """
    noise = re.compile(r'(\s|mt|\(|\))')
    vol_list = []
    movement_list = []
    for vm in vol_movement:
        stripped = may_strip(vm)
        if is_number(stripped):
            vol_list.append(stripped)
            movement_list.append('')
            continue

        # whatever sits between the first and second '(' is the movement; checking
        # for it explicitly avoids hiding unrelated errors behind a blanket except
        volume_part, *rest = vm.split('(')
        vol_list.append(noise.sub('', volume_part))
        movement_list.append(noise.sub('', rest[0]) if rest else '')

    return vol_list, movement_list
예제 #6
0
def process_item(raw_item):
    """Transform a raw row into normalized cargo movement events.

    One event is yielded per cargo found in the row, each as its own dict. Nothing is
    yielded when the row has no vessel name, none of the eta/arrival/berthed/departure
    dates, or no cargo product. Rows for which `normalize_prod_vol_move` returns nothing
    are appended to `MISSING_ROWS`.

    Args:
        raw_item (Dict[str, str]):

    Yields:
        Dict[str, Any]: normalized cargo movement item

    """
    item = map_keys(raw_item, field_mapping())

    # normalize berth strings
    if item.get('berth'):
        item['berth'] = item['berth'].replace('.', '')

    if item.get('vessel_name_length'):
        v_name, v_loa, v_beam = split_name_loa_beam(
            item.pop('vessel_name_length', None))
        item['vessel'] = {
            'name': v_name,
            'length': int(float(v_loa)) if is_number(v_loa) else None,
            'beam': int(float(v_beam)) if is_number(v_beam) else None,
        }
    else:
        _loa = item.pop('vessel_length', None)
        item['vessel'] = {
            'name': normalize_vessel(item.pop('vessel_name', None)),
            'length': int(float(_loa)) if is_number(_loa) else None,
        }

    # for kandla attachment, the remarks column does not reflect player information but
    # the berth information
    if 'kandla' in item['port_name']:
        item.pop('cargo_player', None)

    if not item['vessel']['name']:
        return

    # handle kandla attachment where berth dates potentially could be in 5 columns;
    # the last non-empty column wins
    for berthed_col in ('berthed_1', 'berthed_2', 'berthed_3', 'berthed_4',
                        'berthed_5'):
        berthed_value = item.pop(berthed_col, None)
        if berthed_value:
            item['berthed'] = berthed_value

    for date_col in ('eta', 'arrival', 'berthed', 'departure', 'eta_holder'):
        item[date_col] = (
            normalize_dates(item[date_col], item['reported_date'])
            if item.get(date_col) else None
        )
    if item.get('eta_holder'):
        item['eta'] = item.get('eta_holder', None)

    if (not item.get('eta') and not item.get('berthed')
            and not item.get('arrival') and not item.get('departure')):
        return

    cargo_products = None
    cargo_volume = None
    cargo_movement = None
    # handle normal cases where cargo and quantity are nicely seperated
    if item.get('cargo_product') and item.get('cargo_volume'):
        cargo_products = item.pop('cargo_product', None)
        cargo_volume = item.pop('cargo_volume', None)
        cargo_movement = item.pop('cargo_movement', None)

    # if cargo and volume are joint together
    if item.get('cargo_product_volume'):
        cargo_products, cargo_volume = split_product_vol(
            item.pop('cargo_product_volume', None))
        cargo_movement = item.pop('cargo_movement', None)

    if item.get('cargo_product') and not item.get('cargo_volume'):
        cargo_products, cargo_volume = split_product_vol(
            item.pop('cargo_product', None))
        cargo_movement = item.pop('cargo_movement', None)

    if not cargo_products:
        return

    zipped_cargo_list = normalize_prod_vol_move(cargo_products, cargo_volume,
                                                cargo_movement)
    player = item.get('cargo_player') or item.get(
        'potential_cargo_player') or None
    for col in ('cargo_player', 'potential_cargo_player', 'eta_holder'):
        item.pop(col, None)

    if not zipped_cargo_list:
        MISSING_ROWS.append(str(raw_item))
        return

    for product, volume, movement_code in zipped_cargo_list:
        movement = MOVEMENT_MAPPING.get(movement_code, None)
        # the player role depends on this cargo's own movement only, so it is
        # decided afresh for every cargo instead of leaking from the previous one
        seller = player if movement == 'load' else None
        buyer = player if movement == 'discharge' else None

        cargo = {
            'product': may_strip(product),
            'movement': movement if movement else None,
            'volume': str(volume) if volume else None,
            'volume_unit': Unit.tons,
            'buyer': {
                'name': buyer
            } if buyer and buyer not in BLACKLIST else None,
            'seller': {
                'name': seller
            } if seller and seller not in BLACKLIST else None,
        }

        if cargo['product'] in BLACKLIST:
            continue

        # yield a separate dict per cargo so that a consumer holding on to earlier
        # events does not see them overwritten by later cargoes
        event = item.copy()
        event['cargo'] = cargo
        yield event
예제 #7
0
def normalize_dates(raw_date, rpt_date):
    """Normalize a raw date string, with optional time, to ISO 8601 format.

    'am' and 'pm' markers are turned into the times 0600 and 1500 respectively. Inputs
    made of two numbers separated by '/' are resolved through `get_date_range` using the
    reported date. Otherwise the string is split into tokens: a numeric token is taken
    as the time and a token with three dot-separated parts as the date.

    Args:
        raw_date (str): date as found in the source
        rpt_date (str): reported date, handed to `get_date_range` for dd/mm inputs

    Examples:
        >>> normalize_dates('1400 hrs 01.01.2020', '2020-01-01T00:00:00')
        '2020-01-01T14:00:00'
        >>> normalize_dates('2106 hrs/02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T21:06:00'
        >>> normalize_dates('2106 hrs /02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T21:06:00'
        >>> normalize_dates('02.01.2020(NOR)', '2020-01-01T00:00:00')
        '2020-01-02T00:00:00'
        >>> normalize_dates('02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T00:00:00'
        >>> normalize_dates('02.01.20', '2020-01-01T00:00:00')
        '2020-01-02T00:00:00'
        >>> normalize_dates('22.03.2020-1800', '2020-01-01T00:00:00')
        '2020-03-22T18:00:00'
        >>> normalize_dates('22.03.2020-AM', '2020-01-01T00:00:00')
        '2020-03-22T06:00:00'
        >>> normalize_dates(' am 02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T06:00:00'
        >>> normalize_dates('02.01.2020/0700 hrs', '2020-01-01T00:00:00')
        '2020-01-02T07:00:00'
        >>> normalize_dates('pm hrs 06.01.2020', '2020-01-01T00:00:00')
        '2020-01-06T15:00:00'
        >>> normalize_dates('26/01', '2020-01-01T00:00:00')
        '2020-01-26T00:00:00'
        >>> normalize_dates('26/01', '2019-12-30T00:00:00')
        '2020-01-26T00:00:00'

    Returns:
        str | None: ISO 8601 date, or None when no date token is found or parsing fails

    """
    # swap am/pm markers for a concrete time followed by a separator
    cleaned = raw_date.lower().replace('am', '0600/').replace('pm', '1500/')

    # already in ISO format: hand it back in upper case
    iso_candidate = cleaned.upper()
    if is_isoformat(iso_candidate):
        return iso_candidate

    # dd/mm only: let get_date_range work out the full date
    slash_parts = cleaned.split('/')
    if len(slash_parts) == 2 and all(is_number(part) for part in slash_parts):
        resolved, _ = get_date_range(cleaned, '/', '-', rpt_date)
        return resolved

    # break the string into date and time tokens, dropping empty pieces
    tokens = []
    for piece in re.split(r'(hrs /|hrs/|/ hrs|hrs|/|\()|\-', cleaned):
        if piece:
            tokens.append(may_strip(piece))

    date_part = None
    time_part = ''
    for token in tokens:
        if is_number(token):
            time_part = token
        elif len(token.split('.')) == 3:
            date_part = token

    if not date_part:
        return None

    try:
        return to_isoformat(may_strip(f'{date_part} {time_part}'), dayfirst=True)
    except Exception:
        return None
예제 #8
0
    def parse_pdf(self, response):
        """Extract vessel and cargo rows from the tabulated PDF report.

        The table is scanned once per section. 606-ETA is one section consisting of 3
        columns, therefore the column window advances by 3 on every pass, starting at
        column 3 and stopping before column 41, i.e. 13 sections in total.

        Date | Time  | Day | 606                      | LOA |  ETA    | 607 | LOA | ETA | ...
        ---------------------------------------------------------------------------------------
        5    | 06:00 | Wed | Ultra Wollongong = 10000 | 200 |  01-Feb |
             | 14:00 |     | Ferro Chrome             |     |         |
             | 22:00 |     | Ferro Chrome             |     |         |

        Within a section, a row whose LOA cell is numeric starts a new vessel. The first
        cell of every following row is appended to that vessel's cargo list until the next
        vessel row is found. A vessel is yielded once a later row shows a different vessel
        name; the vessel still pending at the end of the last section is yielded explicitly.

        Args:
            response (scrapy.Response):

        Yields:
            the result of `normalize.process_item` for each vessel item

        """
        table = self.extract_pdf_io(response.body, **self.tabula_options)
        start_processing = False
        # first column of the 3-column window currently being read
        start_index_of_interest = 3
        final_index_of_interest = 41
        prev_vessel_item = None
        cargo_list = []
        while start_index_of_interest < final_index_of_interest:
            prev_month_item = None
            for idx, row in enumerate(table):
                # detect relevant row to start processing
                if 'TimeDay' in ''.join(row):
                    start_processing = True
                    continue

                if not start_processing:
                    continue

                # memoise day as subsequent cells are empty until the next day cell is
                # filled
                if row[0]:
                    prev_day_item = row[0]
                    # only a cell with three '-' separated parts is kept as the month
                    # reference, otherwise the previous one is retained
                    prev_month_item = (
                        row[0] if len(str(row[0]).split('-')) == 3 else prev_month_item
                    )

                # the three cells of the current section for this row
                batch = row[start_index_of_interest : start_index_of_interest + 3]

                # detect vessel row when the loa column is filled with appropriate number
                if is_number(may_strip(batch[1])):
                    # memoise vessel details
                    # NOTE(review): prev_day_item is only bound once a row with a filled
                    # first cell has been seen - confirm the first vessel row always
                    # follows one
                    vessel_item = {
                        'vessel_name': batch[0],
                        'vessel_loa': batch[1],
                        'eta': batch[2],
                        'berthed_day': prev_day_item if not row[0] else row[0],
                        'berthed_month': prev_month_item,
                        'berthed_time': row[1],
                        'port_name': 'Richards Bay',
                        'provider_name': self.provider,
                        'reported_date': parse_date(
                            may_strip(self.reported_date), dayfirst=True
                        ).isoformat(),
                    }
                    # start a fresh cargo list for the new vessel
                    cargo_list = []
                    continue

                # cargo appears after the vessel row is identified
                # append until the next vessel row is detected
                # NOTE(review): vessel_item is only bound once a vessel row has been
                # seen - confirm no non-vessel row can come first after the header
                cargo_list.append(batch[0])
                vessel_item.update(cargo_list=cargo_list,)

                # once the last index is reached, restart the processing
                if idx == len(table) - 1:
                    start_processing = False

                # yield item once cargo list is complete
                # (i.e. the current vessel differs by name from the previously seen one)
                if prev_vessel_item and (
                    vessel_item.get('vessel_name') != prev_vessel_item.get('vessel_name')
                ):
                    yield normalize.process_item(prev_vessel_item)

                # since this is based on previous and current item comparison to yield,
                # we need to force yield the last item in the last section since there
                # is nothing to compare
                # (39 is the start column of the last section given the bounds above)
                if start_index_of_interest >= 39 and idx == len(table) - 1:
                    yield normalize.process_item(vessel_item)

                prev_vessel_item = vessel_item

            # process next batch in the table
            start_index_of_interest += 3