# Example 1 -- file: dataset_cp.py, project: amrscs/ps
    def country_aggregates(self, members, issue_error='.', verbose=True):
        """
        Aggregate Qualifying Countries from Members Dictionary

        Parameters
        ----------
        members     :   dict
                        Provide Country Aggregate Dictionary
        issue_error :   bool or str, optional(default='.')
                        Issue error for concord_data

        """
        df = self.data.reset_index()
        for cntry in members.keys():
            try:
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(members, x, issue_error=issue_error))  #issue_error = false returns x if no match
            except:
                pass #-eiso3c not found-#
            try:
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(members, x, issue_error=issue_error))  #issue_error = false returns x if no match
            except:
                pass #-iiso3c not found-#
        #-Collapse Items-#
        idx = ['year']
        for item in ['eiso3c', 'iiso3c']:
            if item in df.columns:
                idx.append(item)
        idx.append('productcode')
        if verbose: print "[INFO] Collapsing on index: %s" % idx
        df = df.groupby(idx).sum()
        self.set_data(df, force=True) 
# Example 2
def construct_sitcr2l3(df, data_type, dropAX=True, sitcr2=True, drop_nonsitcr2=True, adjust_hk=(False, None), intertemp_cntrycode=False, drop_incp_cntrycode=False, adjust_units=False, source_institution='un', verbose=True):
        """
        Construct a Self Contained (SC) Direct Action Dataset for Countries at the SITC Revision 2 Level 3
        
        There are no checks on the incoming dataframe to ensure data integrity.
        This is your responsibility

        STATUS: tests/test_constructor_dataset_sitcr2l3.py

        Parameters
        ----------
        df                  :   DataFrame
                                Pandas DataFrame containing the raw data
        data_type           :   str
                                Specify what type of data 'trade', 'export', 'import'
        dropAX              :   bool, optional(default=True)
                                Drop AX Codes 
        sitcr2              :   bool, optional(default=True)
                                Add SITCR2 Indicator
        drop_nonsitcr2      :   bool, optional(default=True)
                                Drop non-standard SITC2 Codes
        adjust_hk           :   Tuple(bool, df), optional(default=(False, None))
                                Adjust the Hong Kong Data using NBER supplemental files which needs to be supplied as a dataframe
        intertemp_cntrycode :   bool, optional(default=False)
                                Generate Intertemporal Consistent Country Units (from meta)
        drop_incp_cntrycode :   bool, optional(default=False)
                                Drop Incomplete Country Codes (from meta)
        adjust_units        :   bool, optional(default=False)
                                Adjust units by a factor of 1000 to specify in $'s
        source_institution  :   str, optional(default='un')
                                which institutions SITC classification to use

        Returns
        -------
        DataFrame
            Collapsed to SITC Level 3 with iso3c country codes and a 'value'
            column; index structure depends on data_type.

        Notes
        -----
        1. Operations ::

            [1] Adjust Hong Kong and China Data
            [2] Drop SITC4 to SITC3 Level (for greater intertemporal consistency)
            [3] Import ISO3C Codes as Country Codes
            [4] Drop Errors in SITC3 codes ["" Codes]
            
            Optional:
            ---------
            [A] Drop sitc3 codes that contain 'A' and 'X' codes [Default: True]
            [B] Drop Non-Standard SITC3 Codes [i.e. Aren't in the Classification]
            [C] Adjust iiso3c, eiso3c country codes to be intertemporally consistent
            [D] Drop countries with incomplete data across 1962 to 2000 (strict measure)
  

        2. This makes use of countryname_to_iso3c in the meta data subpackage
        3. This method can be tested using /do/basic_sitc3_country_data.do
        4. DropAX + Drop NonStandard SITC Rev 2 Codes still contains ~94-96% of the data found in the raw data

        ..  Future Work
            -----------
            1. Check SITC Revision 2 Official Codes
            2. Add in a Year Filter
        """

        #-Operations Requiring RAW SITC Level 4-#
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        #-Raw NBER column layout; assumes df carries these columns -- TODO confirm against caller-#
        idx = [u'year', u'icode', u'importer', u'ecode', u'exporter', u'sitc4', u'unit', u'dot']

        #-Hong Kong China Data Adjustment Option-#
        #-Normalise a bare bool to the (flag, data) tuple form for backward compatibility-#
        if type(adjust_hk) == bool:
            adjust_hk = (adjust_hk, None)
        if adjust_hk[0]:
            if verbose: print "[INFO] Adjusting Hong Kong and China Values"
            hkdata = adjust_hk[1]
            #-Values-#
            raw_value = df[idx+['value']].rename(columns={'value' : 'value_raw'})
            try:
                adjust_value = hkdata[idx+['value_adj']]
            except:
                raise ValueError("[ERROR] China/Hong Kong Data has not been passed in properly!")
            #-Note: Current merge_columns utility merges one column set at a time-#
            df = merge_columns(raw_value, adjust_value, idx, collapse_columns=('value_raw', 'value_adj', 'value'), dominant='right', output='final', verbose=verbose)
            #-Note: Adjust Quantity has not been implemented. See NBERWTF constructor -#

        #-Filter Data-#
        idx = ['year', 'exporter', 'importer', 'sitc4']
        df = df.loc[:, idx + ['value']]

        #-Adjust to SITC Level 3-#
        #-Truncating sitc4 to 3 chars then summing collapses products to level 3-#
        if verbose: print "[INFO] Collapsing to SITC Level 3 Data"
        df['sitc3'] = df.sitc4.apply(lambda x: x[0:3])
        df = df.groupby(['year', 'exporter', 'importer', 'sitc3']).sum()['value'].reset_index()
        
        #-Operations at SITC Level 3-#
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        #-Countries Only Adjustment-#
        if verbose: print "[INFO] Removing 'World' values from the dataset to be country only data"
        df = df.loc[(df.exporter != "World") & (df.importer != "World")]
        
        #-Add Country ISO Information-#
        #-'.' is the countryname_to_iso3c marker for unmatched names and is dropped-#
        #-Exports (can include NES on importer side)-#
        if data_type == 'export' or data_type == 'exports':
            if verbose: print "[INFO] Adding eiso3c using nber meta data"
            df['eiso3c'] = df.exporter.apply(lambda x: countryname_to_iso3c[x])
            df = df.loc[(df.eiso3c != '.')]
            df = df.groupby(['year', 'eiso3c', 'sitc3']).sum()['value'].reset_index()
        #-Imports (can include NES on importer side)-#
        elif data_type == 'import' or data_type == 'imports':
            if verbose: print "[INFO] Adding iiso3c using nber meta data"
            df['iiso3c'] = df.importer.apply(lambda x: countryname_to_iso3c[x])
            df = df.loc[(df.iiso3c != '.')]
            df = df.groupby(['year','iiso3c', 'sitc3']).sum()['value'].reset_index()
        #-Trade-#
        else: 
            if verbose: print "[INFO] Adding eiso3c and iiso3c using nber meta data"
            df['iiso3c'] = df.importer.apply(lambda x: countryname_to_iso3c[x])
            df['eiso3c'] = df.exporter.apply(lambda x: countryname_to_iso3c[x])
            df = df.loc[(df.iiso3c != '.') & (df.eiso3c != '.')]
            df = df.groupby(['year', 'eiso3c', 'iiso3c', 'sitc3']).sum()['value'].reset_index()
        
        #-Remove Product Code Errors in Dataset-#
        df = df.loc[(df.sitc3 != "")]                                                                   #Does this need a reset_index?
        #-dropAX-#
        if dropAX:
            if verbose: print "[INFO] Dropping SITC Codes with 'A' or 'X'"
            df['AX'] = df.sitc3.apply(lambda x: 1 if re.search("[AX]", x) else 0)
            df = df.loc[df.AX != 1]
            del df['AX']               #No Longer Required
        
        #-Official SITCR2 Codes-#
        if sitcr2:
            if verbose: print "[INFO] Adding SITCR2 Indicator"
            sitc = SITC(revision=2, source_institution=source_institution)
            codes = sitc.get_codes(level=3)
            df['sitcr2'] = df['sitc3'].apply(lambda x: 1 if x in codes else 0)
            if drop_nonsitcr2:
                if verbose: print "[INFO] Dropping Non Standard SITCR2 Codes"
                df = df.loc[(df.sitcr2 == 1)]
                del df['sitcr2']                #No Longer Needed
        
        #-Adjust Country Codes to be Intertemporally Consistent-#
        if intertemp_cntrycode:
            #-Export-#
            if data_type == 'export' or data_type == 'exports':
                if verbose: print "[INFO] Imposing dynamically consistent eiso3c recodes across 1962-2000"
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df = df[df['eiso3c'] != '.']
                df = df.groupby(['year', 'eiso3c', 'sitc3']).sum().reset_index()
            #-Import-#
            elif data_type == 'import' or data_type == 'imports':
                if verbose: print "[INFO] Imposing dynamically consistent iiso3c recodes across 1962-2000"
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
                df = df.groupby(['year', 'iiso3c', 'sitc3']).sum().reset_index()
            #-Trade-#
            else:
                if verbose: print "[INFO] Imposing dynamically consistent iiso3c and eiso3c recodes across 1962-2000"
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
                df = df[df['eiso3c'] != '.']
                df = df.groupby(['year', 'eiso3c', 'iiso3c', 'sitc3']).sum().reset_index()
        
        #-Drop Incomplete Country Codes-#
        if drop_incp_cntrycode:
            if verbose: print "[INFO] Dropping countries with incomplete data across 1962-2000"
            #-Export-#
            if data_type == 'export' or data_type == 'exports':
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df = df[df['eiso3c'] != '.']
            #-Import-#
            elif data_type == 'import' or data_type == 'imports':
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
            #-Trade-#
            else:
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
                df = df[df['eiso3c'] != '.']
            #-Re-sequence the row index after filtering (equivalent to reset_index(drop=True))-#
            df = df.reset_index()
            del df['index']
       
        #-Adjust Units from 1000's to $'s-#
        if adjust_units:
            if verbose: print "[INFO] Adjusting 'value' units to $'s"
            df['value'] = df['value']*1000         #Default: Keep in 1000's
        
        #-Return Dataset-#
        if verbose: print "[INFO] Finished Computing Dataset (%s) ..." % (data_type) 
        return df
# Example 3
    world_values = rawdata[["year", "value"]].groupby(["year"]).sum()
    store.put('World', world_values, format='table')
    store.close()
    del world_values
    gc.collect()

if RAW_COUNTRY_YEARLY:
    print
    print "---> COMPUTING COUNTRY YEARLY VALUES FROM RAW BACI DATASET <---"
    print
    #-Setup Store-#
    fn = "raw_baci_country_year-1998to2012.h5"
    store = pd.HDFStore(TARGET_DIR + fn, complevel=9, complib='zlib')
    #-Import ISO3C-#
    from pyeconlab.trade.dataset.CEPIIBACI.meta import hs96_iso3n_to_iso3c
    rawdata['eiso3c'] = rawdata['eiso3n'].apply(lambda x: concord_data(
        hs96_iso3n_to_iso3c, x, issue_error=np.nan))  #Is this Complete?
    rawdata['iiso3c'] = rawdata['iiso3n'].apply(lambda x: concord_data(
        hs96_iso3n_to_iso3c, x, issue_error=np.nan))  #Is this Complete?
    #-Country Exports-#
    exports = rawdata[["year", "eiso3c",
                       "value"]].groupby(["year",
                                          "eiso3c"]).sum().reset_index()
    store.put("CountryExports", exports, format='table')
    #-Country Imports-#
    imports = rawdata[["year", "iiso3c",
                       "value"]].groupby(["year",
                                          "iiso3c"]).sum().reset_index()
    store.put("CountryImports", imports, format='table')
    store.close()
    del exports
    del imports
    world_values = rawdata[["year", "value"]].groupby(["year"]).sum()
    store.put('World', world_values, format='table')
    store.close()
    del world_values
    gc.collect()

if RAW_COUNTRY_YEARLY:
    print
    print "---> COMPUTING COUNTRY YEARLY VALUES FROM RAW BACI DATASET <---"
    print
    #-Setup Store-#
    fn = "raw_baci_country_year-1998to2012.h5"
    store = pd.HDFStore(TARGET_DIR+fn, complevel=9, complib='zlib')
    #-Import ISO3C-#
    from pyeconlab.trade.dataset.CEPIIBACI.meta import hs96_iso3n_to_iso3c
    rawdata['eiso3c'] = rawdata['eiso3n'].apply(lambda x: concord_data(hs96_iso3n_to_iso3c, x, issue_error=np.nan))     #Is this Complete?
    rawdata['iiso3c'] = rawdata['iiso3n'].apply(lambda x: concord_data(hs96_iso3n_to_iso3c, x, issue_error=np.nan))     #Is this Complete?
    #-Country Exports-#
    exports = rawdata[["year", "eiso3c", "value"]].groupby(["year", "eiso3c"]).sum().reset_index()
    store.put("CountryExports", exports, format='table')
    #-Country Imports-#
    imports = rawdata[["year", "iiso3c", "value"]].groupby(["year", "iiso3c"]).sum().reset_index()
    store.put("CountryImports", imports, format='table')
    store.close()
    del exports
    del imports
    gc.collect()

if RAW_PRODUCT_YEARLY:
    
    ## Should this be filtered through a countries only filter? ##
    # NOTE(review): this branch appears truncated in this chunk -- the body is
    # missing (a comment alone is not a statement, so this is a syntax error
    # as it stands). Recover the original body before running this script.
def harmonise_data(
    df,
    data_type,
    level,
    intertemp_productcode=(False, None),
    intertemp_cntrycode=False,
    drop_incp_cntrycode=False,
    adjust_units=False,
    verbose=True,
):
    """
    Construct a Harmonised Dataset between NBER and BACI

    Parameters
    ----------
    df                  :   DataFrame
                            Pandas DataFrame containing the raw data
    data_type           :   str
                            Specify what type of data 'trade', 'export', 'import'
    level               :   int
                            Specify Level of Final dataset (i.e. SITC Level 1, 2, 3, or 4)
    intertemp_productcode : Tuple(bool, dict), optional(default=(False, None))
                            Apply an Intertemporal Product Code System using a conversion
                            dictionary (IC["drop"] = [], IC["collapse"] = [], IC["recode"] = {})
                            Note this will override the drop_nonsitcr2 option
    intertemp_cntrycode :   bool, optional(default=False)
                            Generate Intertemporal Consistent Country Units (from meta)
    drop_incp_cntrycode :   bool, optional(default=False)
                            Drop Incomplete Country Codes (from meta)
    adjust_units        :   bool, optional(default=False)
                            Adjust units by a factor of 1000 to specify in $'s
    verbose             :   bool, optional(default=True)
                            Print progress information

    Returns
    -------
    DataFrame
        The harmonised dataset with a 'value' column.

    Notes
    -----
    1. This consists of code snippets from construct_dataset_sitcr2.py
    2. The recode step previously re-applied the identical mapping once per
       recode key (the loop variable was unused); a single pass is equivalent
       assuming recode targets are not themselves recode keys -- TODO confirm.
    """

    # -Intertemporal ProductCodes-#
    if intertemp_productcode[0]:
        if verbose:
            print("[INFO] Computing Intertemporally Consistent ProductCodes ...")
        # -This Method relies on meta data computed by pyeconlab nberwtf constructor-#
        IC = intertemp_productcode[1]  # Dict("drop", "collapse" and "recode" code lists)
        # -Drop Codes-#
        drop_codes = IC["drop"]
        if verbose:
            print("Dropping the following productcodes ...")
            print(drop_codes)
        keep_codes = set(df["sitc%s" % level].unique()).difference(set(drop_codes))
        df = df.loc[df["sitc%s" % level].isin(keep_codes)].copy(deep=True)
        # -Collapse Codes-#
        collapse_codes = IC["collapse"]
        if verbose:
            print("Collapsing the following productcodes ...")
            print(collapse_codes)
        collapse_codes = {x[0 : level - 1] for x in collapse_codes}  # -Simplify Computations-#
        for code in collapse_codes:
            # -Any code sharing this (level-1)-digit prefix is collapsed onto the prefix-#
            df["sitc%s" % level] = df["sitc%s" % level].apply(
                lambda x: code if x[0 : level - 1] == code else x
            )  # code+'0'
        # -Recodes-#
        recodes = IC["recode"]
        recode_codes = set(recodes.keys())  # -set for O(1) membership (consistent with construct_sitcr2)-#
        if verbose:
            print("Recoding the following productcodes ...")
            print(recode_codes)
        # -FIX: apply the mapping in a single pass; the old per-key loop never
        #  used its loop variable and repeated this identical apply k times-#
        df["sitc%s" % level] = df["sitc%s" % level].apply(lambda x: recodes[x] if x in recode_codes else x)
        # -Reset Collapsed Codes-#
        df = df.groupby(list(df.columns.drop("value"))).sum()
        df = df.reset_index()

    # -Adjust Country Codes to be Intertemporally Consistent-#
    if intertemp_cntrycode:
        # -Export-#
        if data_type == "export" or data_type == "exports":
            if verbose:
                print("[INFO] Imposing dynamically consistent eiso3c recodes across 1962-2000")
            df["eiso3c"] = df["eiso3c"].apply(
                lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df = df[df["eiso3c"] != "."]
            df = df.groupby(["year", "eiso3c", "sitc%s" % level]).sum().reset_index()
        # -Import-#
        elif data_type == "import" or data_type == "imports":
            if verbose:
                print("[INFO] Imposing dynamically consistent iiso3c recodes across 1962-2000")
            df["iiso3c"] = df["iiso3c"].apply(
                lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df = df[df["iiso3c"] != "."]
            df = df.groupby(["year", "iiso3c", "sitc%s" % level]).sum().reset_index()
        # -Trade-#
        else:
            if verbose:
                print("[INFO] Imposing dynamically consistent iiso3c and eiso3c recodes across 1962-2000")
            df["iiso3c"] = df["iiso3c"].apply(
                lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df["eiso3c"] = df["eiso3c"].apply(
                lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df = df[df["iiso3c"] != "."]
            df = df[df["eiso3c"] != "."]
            df = df.groupby(["year", "eiso3c", "iiso3c", "sitc%s" % level]).sum().reset_index()

    # -Drop Incomplete Country Codes-#
    if drop_incp_cntrycode:
        if verbose:
            print("[INFO] Dropping countries with incomplete data across 1962-2000")
        # -Export-#
        if data_type == "export" or data_type == "exports":
            df["eiso3c"] = df["eiso3c"].apply(
                lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df = df[df["eiso3c"] != "."]
        # -Import-#
        elif data_type == "import" or data_type == "imports":
            df["iiso3c"] = df["iiso3c"].apply(
                lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df = df[df["iiso3c"] != "."]
        # -Trade-#
        else:
            df["iiso3c"] = df["iiso3c"].apply(
                lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df["eiso3c"] = df["eiso3c"].apply(
                lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False)
            )  # issue_error = false returns x if no match
            df = df[df["iiso3c"] != "."]
            df = df[df["eiso3c"] != "."]
        # -FIX: drop=True replaces the fragile reset_index() + del df["index"] pair-#
        df = df.reset_index(drop=True)

    # -Adjust Units from 1000's to $'s-#
    if adjust_units:
        if verbose:
            print("[INFO] Adjusting 'value' units to $'s")
        df["value"] = df["value"] * 1000  # Default: Keep in 1000's

    # -Return Dataset-#
    if verbose:
        print("[INFO] Finished Computing Harmonised Dataset (%s) ..." % (data_type))
    return df
def construct_sitcr2(df, data_type, level, AX=True, dropAX=True, sitcr2=True, drop_nonsitcr2=True, adjust_hk=(False, None), intertemp_productcode=(False, None), intertemp_cntrycode=False, drop_incp_cntrycode=False, adjust_units=False, source_institution='un', harmonised_raw=False, values_only=False, verbose=True):
        """
        Construct a Self Contained (SC) Direct Action Dataset for Countries at the SITC Revision 2 Level 3
        
        There are no checks on the incoming dataframe to ensure data integrity.
        This is your responsibility

        STATUS: tests/test_constructor_dataset_sitcr2l3.py

        Parameters
        ----------
        df                  :   DataFrame
                                Pandas DataFrame containing the raw data
        data_type           :   str
                                Specify what type of data 'trade', 'export', 'import'
        level               :   int
                                Specify Level of Final dataset (i.e. SITC Level 1, 2, 3, or 4)
        AX                  :   bool, optional(default=True)
                                Add a Marker for Codes that Include 'A' and 'X'
        dropAX              :   bool, optional(default=True)
                                Drop AX Codes at the Relevant Level (i.e. SITC Level 3 Data will include appropriate A and X codes)
        sitcr2              :   bool, optional(default=True)
                                Add SITCR2 Indicator
        drop_nonsitcr2      :   bool, optional(default=True)
                                Drop non-standard SITC2 Codes
        adjust_hk           :   Tuple(bool, df), optional(default=(False, None))
                                Adjust the Hong Kong Data using NBER supplemental files which needs to be supplied as a dataframe
        intertemp_productcode : Tuple(bool, dict), optional(default=False, None)
                                Apply an Intertemporal Product Code System drop a conversion dictionary (IC["drop"] = [], IC["collapse"] = [])
                                Note this will override the drop_nonsitcr2 option
        intertemp_cntrycode :   bool, optional(default=False)
                                Generate Intertemporal Consistent Country Units (from meta)
        drop_incp_cntrycode :   bool, optional(default=False)
                                Drop Incomplete Country Codes (from meta)
        adjust_units        :   bool, optional(default=False)
                                Adjust units by a factor of 1000 to specify in $'s
        source_institution  :   str, optional(default='un')
                                which institutions SITC classification to use
        harmonised_raw      :   bool, optional(default=False)
                                Return simple RAW dataset with Quantity disaggregation collapsed and eiso3c and iiso3c columns (Note: You may use hk_adjust with this option)
        values_only         :   bool, optional(default=False)
                                Return Values and Relevant Index Data Only (i.e. drop 'AX', 'sitcr2')

        Notes
        -----
        1. Operations ::

            [1] Adjust Hong Kong and China Data
            [2] Drop SITC4 to SITC3 Level (for greater intertemporal consistency)
            [3] Import ISO3C Codes as Country Codes
            [4] Drop Errors in SITC3 codes ["" Codes]
            
            Optional:
            ---------
            [A] Drop sitc3 codes that contain 'A' and 'X' codes [Default: True]
            [B] Drop Non-Standard SITC3 Codes [i.e. Aren't in the Classification]
            [C] Construct an Intertemporal Product Code Classification and Adjust Dataset
            [C] Adjust iiso3c, eiso3c country codes to be intertemporally consistent
            [D] Drop countries with incomplete 'total' data across 1962 to 2000 (strict measure) [Identification Debatable]
  

        3. This makes use of countryname_to_iso3c in the meta data subpackage
        4. This method can be tested using /do/basic_sitc3_country_data.do
        5. DropAX + Drop NonStandard SITC Rev 2 Codes still contains ~94-96% of the data found in the raw data

        ..  Future Work
            -----------
            1. Check SITC Revision 2 Official Codes
            2. Add in a Year Filter

        """
        
        #-Operations Requiring RAW SITC Level 4-#
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        idx = [u'year', u'icode', u'importer', u'ecode', u'exporter', u'sitc4', u'unit', u'dot']

        #-Hong Kong China Data Adjustment Option-#
        if type(adjust_hk) == bool:
            adjust_hk = (adjust_hk, None)
        if adjust_hk[0]:
            if verbose: print "[INFO] Adjusting Hong Kong and China Values"
            hkdata = adjust_hk[1]
            #-Values-#
            raw_value = df[idx+['value']].rename(columns={'value' : 'value_raw'})
            try:
                adjust_value = hkdata[idx+['value_adj']]
            except:
                raise ValueError("[ERROR] China/Hong Kong Data has not been passed in properly!")
            #-Note: Current merge_columns utility merges one column set at a time-# 
            df = merge_columns(raw_value, adjust_value, idx, collapse_columns=('value_raw', 'value_adj', 'value'), dominant='right', output='final', verbose=verbose)
            #-Note: Adjust Quantity has not been implemented. See NBERWTF constructor -#

        #-Filter Data-#
        idx = [u'year', u'exporter', u'importer', u'sitc4']         #Note: This collapses duplicate entries with unit differences (collapse_valuesonly())
        df = df.loc[:,idx + ['value']]

        #-Raw Trade Data Option with Added IISO3C and EISO3C-#
        if harmonised_raw and data_type == "trade":
            df = df.groupby(idx).sum().reset_index()                              #Sum Over Quantity Disaggregations
            #-Add EISO3C and IISO3C-#
            df['eiso3c'] = df['exporter'].apply(lambda x: countryname_to_iso3c[x])
            df['iiso3c'] = df['importer'].apply(lambda x: countryname_to_iso3c[x])
            return df
        if harmonised_raw and data_type in {"export", "import"}:
            warnings.warn("Cannot run harmonised_raw over export and import data as raw data is trade data")
            return None

        #-Collapse to SITC Level -#
        if level != 4:
            if verbose: print "[INFO] Collapsing to SITC Level %s Data" % level
            df['sitc%s'%level] = df.sitc4.apply(lambda x: x[0:level])
            df = df.groupby(['year', 'exporter', 'importer', 'sitc%s'%level]).sum()['value'].reset_index()
        elif level == 4:
            if verbose: print "[INFO] Data is already at the requested level"
        else:
            raise ValueError("Level must be 1, 2, 3, or 4 for the NBER data")

        #-Operations Post Collapse to SITC Level-#
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        #-Countries Only Adjustment-#
        if verbose: print "[INFO] Removing 'World' values so that the dataset is country only data"
        df = df.loc[(df.exporter != "World") & (df.importer != "World")]

        #-Add Country ISO Information-#
        #-Exports (can include NES on importer side)-#
        if data_type == 'export' or data_type == 'exports':
            if verbose: print "[INFO] Adding eiso3c using nber meta data"
            df['eiso3c'] = df.exporter.apply(lambda x: countryname_to_iso3c[x])
            df = df.loc[(df.eiso3c != '.')]
            df = df.groupby(['year', 'eiso3c', 'sitc%s'%level]).sum()['value'].reset_index()
        #-Imports (can include NES on importer side)-#
        elif data_type == 'import' or data_type == 'imports':
            if verbose: print "[INFO] Adding iiso3c using nber meta data"
            df['iiso3c'] = df.importer.apply(lambda x: countryname_to_iso3c[x])
            df = df.loc[(df.iiso3c != '.')]
            df = df.groupby(['year','iiso3c', 'sitc%s'%level]).sum()['value'].reset_index()
        #-Trade-#
        else: 
            if verbose: print "[INFO] Adding eiso3c and iiso3c using nber meta data"
            df['iiso3c'] = df.importer.apply(lambda x: countryname_to_iso3c[x])
            df['eiso3c'] = df.exporter.apply(lambda x: countryname_to_iso3c[x])
            df = df.loc[(df.iiso3c != '.') & (df.eiso3c != '.')]
            df = df.groupby(['year', 'eiso3c', 'iiso3c', 'sitc%s'%level]).sum()['value'].reset_index()
        
        #-Remove Product Code Errors in Dataset-#
        df = df.loc[(df['sitc%s'%level] != "")]                                                                   #Does this need a reset_index?
        
        #-productcodes-#
        if intertemp_productcode[0]:
            if level == 1:
                intertemp_productcode = (False, intertemp_productcode[1])
            else:
                AX = True
                dropAX = True               #Small Impact Post 1984 (Levels < 4 Include 'A' and 'X' values due to the collapse)
                sitcr2 = True               #Encode SITCR2 for Parsing
                drop_nonsitcr2 = False

        #-AX-#
        if AX:
            if verbose: print "[INFO] Adding Indicator Codes of 'A' and 'X'"
            df['AX'] = df['sitc%s'%level].apply(lambda x: 1 if re.search("[AX]", x) else 0)
            #-dropAX-#
            if dropAX:
                if verbose: print "[INFO] Dropping SITC Codes with 'A' or 'X'"
                df = df.loc[df.AX != 1]
                del df['AX']
            if not dropAX and values_only:
                del df['AX']
        
        #-Intertemporal ProductCodes-#
        if intertemp_productcode[0]:
            if verbose: print "[INFO] Computing Intertemporally Consistent ProductCodes ..."
            #-This Method relies on meta data computed by pyeconlab nberwtf constructor-#
            IC = intertemp_productcode[1]               #Dict("drop" and "collapse" code lists)
            #-Drop Codes-#
            drop_codes = IC["drop"]
            if verbose: 
                print "Dropping the following productcodes ..."
                print drop_codes
            keep_codes = set(df['sitc%s'%level].unique()).difference(set(drop_codes))
            df = df.loc[df["sitc%s"%level].isin(keep_codes)].copy(deep=True)
            #-Collapse Codes-#
            collapse_codes = IC["collapse"]
            if verbose:
                print "Collapsing the following productcodes ..."
                print collapse_codes
            collapse_codes = {x[0:level-1] for x in collapse_codes}     #-Simplify Computations-#
            for code in collapse_codes:
                df["sitc%s"%level] = df["sitc%s"%level].apply(lambda x: code if x[0:level-1] == code else x)
            #-Recodes-#
            recodes = IC["recode"]
            recode_codes = set(recodes.keys())
            if verbose: 
                print "Recoding the following productcodes ..."
                print recode_codes
            for code in recode_codes:
                df["sitc%s"%level] = df["sitc%s"%level].apply(lambda x: recodes[x] if x in recode_codes else x)
            df = df.groupby(list(df.columns.drop("value"))).sum()
            df = df.reset_index()

        #-Official SITCR2 Codes-#
        if sitcr2:
            if verbose: print "[INFO] Adding SITCR2 Indicator"
            sitc = SITC(revision=2, source_institution=source_institution)
            codes = sitc.get_codes(level=level)
            df['sitcr2'] = df['sitc%s'%level].apply(lambda x: 1 if x in codes else 0)
            if drop_nonsitcr2:
                if verbose: print "[INFO] Dropping Non Standard SITCR2 Codes"
                df = df.loc[(df.sitcr2 == 1)]
                del df['sitcr2']                #No Longer Needed
            if not drop_nonsitcr2 and values_only:
                del df['sitcr2']

        #-Adjust Country Codes to be Intertemporally Consistent-#
        if intertemp_cntrycode:
            #-Export-#
            if data_type == 'export' or data_type == 'exports':
                if verbose: print "[INFO] Imposing dynamically consistent eiso3c recodes across 1962-2000"
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df = df[df['eiso3c'] != '.']
                df = df.groupby(['year', 'eiso3c', 'sitc%s'%level]).sum().reset_index()
            #-Import-#
            elif data_type == 'import' or data_type == 'imports':
                if verbose: print "[INFO] Imposing dynamically consistent iiso3c recodes across 1962-2000"
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
                df = df.groupby(['year', 'iiso3c', 'sitc%s'%level]).sum().reset_index()
            #-Trade-#
            else:
                if verbose: print "[INFO] Imposing dynamically consistent iiso3c and eiso3c recodes across 1962-2000"
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(iso3c_recodes_for_1962_2000, x, issue_error=False))    #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
                df = df[df['eiso3c'] != '.']
                df = df.groupby(['year', 'eiso3c', 'iiso3c', 'sitc%s'%level]).sum().reset_index()
        
        #-Drop Incomplete Country Codes-#
        if drop_incp_cntrycode:
            if verbose: print "[INFO] Dropping countries with incomplete data across 1962-2000"
            #-Export-#
            if data_type == 'export' or data_type == 'exports':
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df = df[df['eiso3c'] != '.']
            #-Import-#
            elif data_type == 'import' or data_type == 'imports':
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
            #-Trade-#
            else:
                df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(incomplete_iso3c_for_1962_2000, x, issue_error=False))     #issue_error = false returns x if no match
                df = df[df['iiso3c'] != '.']
                df = df[df['eiso3c'] != '.']
            df = df.reset_index()
            del df['index']
       
        #-Adjust Units from 1000's to $'s-#
        if adjust_units:
            if verbose: print "[INFO] Adjusting 'value' units to $'s"
            df['value'] = df['value']*1000         #Default: Keep in 1000's
        
        #-Return Dataset-#
        if verbose: print "[INFO] Finished Computing Dataset (%s) ..." % (data_type) 
        return df
def construct_sitc(data, data_classification, data_type, level, revision, check_concordance=True, adjust_units=False, concordance_institution="un", multiindex=False, verbose=True):
    """
    A Self Contained Function for Producing SITC Datasets from the BACI Dataset
    **Note:** Self Contained methods reduce the Need to Debug other routines and methods.
    The other constructor methods are however useful to diagnose issues and to understand properties of the dataset

    Parameters
    ----------
    data            :   pd.DataFrame 
                        Pandas DataFrame that contains RAW BACI HS data.
                        WARNING: mutated in place (columns 't','i','j','v' are renamed
                        and 'quantity' is deleted) before the collapsed copy is returned.
    data_classification :   str
                        HS classification of the source data (e.g. 'HS96'); used to
                        select the HS -> SITC product concordance
    data_type       :   str
                        Specify data type: 'trade', 'export', 'import'
                        'export' will include values from a country to any region (including NES, and Neutral Zone etc.)
                        'import' will include values to a country from any region (including NES, and Neutral Zone etc.)
    level           :   int 
                        Specify SITC Chapter Level (1,2,3,4 or 5)
    revision        :   int 
                        Specify which SITC Revision to use (Revision 1,2,3, or 4)
    check_concordance : bool, optional(default=True)
                        Check Concordance for Full Matches to ensure no orphaned data
                        (raises ValueError listing any hs6 codes with no SITC match)
    adjust_units    :   bool, optional(default=False)
                        Adjust units to $'s from 1000 of $'s. Default is to keep the base dataset values in 1000's
    concordance_institution     :   str, optional(default="un")
                                    Specify which institution to use for product concordance information
    multiindex      :   bool, optional(default=False)
                        Return dataset with a multi-index object

    Returns
    -------
    pd.DataFrame
        Values collapsed (summed) by year / iso3c country code(s) / sitc<level>;
        returned as a MultiIndex frame when multiindex=True, otherwise reset to columns

    Raises
    ------
    ValueError
        If data_type is not 'trade', 'export'/'exports' or 'import'/'imports',
        or (when check_concordance=True) if any hs6 code fails to concord

    Notes
    -----
    1. Will need to consider the source BACI dataset HS96 etc. to fetch the correct Concordance
    2. When joining with other datasets, global merge attributes should be considered externally to this method
    3. Currently this excludes Quantity Information

    """

    #-Helper Functions-#
    def merge_iso3c_and_replace_iso3n(data, cntry_data, column):
        " Merge ISO3C and Replace match on column (i.e. eiso3n)"
        data = data.merge(cntry_data, how='left', left_on=[column], right_on=['iso3n'])
        del data['iso3n']
        del data[column]
        #-e.g. 'eiso3n' -> 'eiso3c' (swap the trailing 'n' for 'c')-#
        data.rename(columns={'iso3c' : column[0:-1]+'c'}, inplace=True)
        return data

    def dropna_iso3c(data, column):
        " Drop iiso3c or eiso3c isnull() values "
        #-Rows with no ISO3C match (left-merge misses, e.g. NES and regional codes) are removed in place-#
        if column == 'iiso3c':
            data.drop(data.loc[(data.iiso3c.isnull())].index, inplace=True)
        elif column == 'eiso3c':
            data.drop(data.loc[(data.eiso3c.isnull())].index, inplace=True)
        return data

    def check_concordance_helper(data, level):
        " Raise ValueError if any hs6 product failed to obtain an SITC code "
        check = data.loc[data['sitc%s'%level].isnull()]
        if len(check) > 0:
            raise ValueError("Concordance doesn't provide match for the following products: %s" % (check.hs6.unique()))

    #-Obtain Key Index Variables-#
    #-Note: this renames the caller's DataFrame in place-#
    data.rename(columns={'t' : 'year', 'i' : 'eiso3n', 'j' : 'iiso3n', 'v' : 'value', 'q': 'quantity'}, inplace=True)   #'hs6' is unchanged
    #-Exclude Quantity-#
    del data['quantity']
    #-Import Country Codes to ISO3C-#
    from pyeconlab.trade.dataset.CEPIIBACI.meta import hs96_iso3n_to_iso3c          #This doesn't include np.nan - is this going to be an issue?
    hs96_iso3n_to_iso3c = {int(k):v for k,v in hs96_iso3n_to_iso3c.items()}         #-Coerce keys to int to match the iso3n merge columns-#
    cntry_data = pd.Series(hs96_iso3n_to_iso3c).to_frame().reset_index()
    cntry_data.columns = ['iso3n', 'iso3c']
    cntry_data = cntry_data.sort(columns=['iso3n'])     #NOTE(review): DataFrame.sort was removed in pandas 0.20+; sort_values(by=['iso3n']) is the modern equivalent
    #-Import Product Concordance-#
    from pyeconlab.trade.concordance import HS_To_SITC
    concordance = HS_To_SITC(hs=data_classification, sitc="SITCR%s"%revision, hs_level=6, sitc_level=level, source_institution=concordance_institution, verbose=verbose).concordance
    #-Add Special Cases to the concordance-#
    
    # from .base import BACI
    # for k,v in BACI.adjust_hs6_to_sitc[data_classification].items():  #This Needs a Level Consideration
    #     concordance[k] = v
    
    #-Parse Options-#
    #-Change Value Units (from 1000's of $'s to $'s)-#
    if adjust_units:
        data['value'] = data['value']*1000                     
    #-Collapse Trade Data based on data option-#
    if data_type == "trade":
        #-Merge in ISO3C-#
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='eiso3n')
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='iiso3n')
        print "[WARNING] Dropping Countries where iso3c has null() values will remove country export/import from NES, and other regions!"
        data = dropna_iso3c(data, column='eiso3c')
        data = dropna_iso3c(data, column='iiso3c')
        #-Merge in SITC codes; unmatched hs6 become np.nan (caught below when check_concordance=True)-#
        data['sitc%s'%level] = data['hs6'].apply(lambda x: concord_data(concordance, x, issue_error=np.nan))
        if check_concordance:
            check_concordance_helper(data, level)
        del data['hs6']
        data = data.groupby(['year', 'eiso3c', 'iiso3c', 'sitc%s'%level]).sum()
        print "[Returning] BACI HS96 Source => TRADE data for SITCR%sL%s with ISO3C Countries" % (revision, level)
    elif data_type == "export" or data_type == "exports":
        #-Export Level: drop the importer dimension and collapse over partners-#
        del data['iiso3n']
        data = data.groupby(['year', 'eiso3n', 'hs6']).sum().reset_index()
        #-Merge in ISO3C-#
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='eiso3n')
        data = dropna_iso3c(data, column='eiso3c')
        #-Merge in SITC codes; unmatched hs6 become np.nan-#
        data['sitc%s'%level] = data['hs6'].apply(lambda x: concord_data(concordance, x, issue_error=np.nan))
        if check_concordance:
            check_concordance_helper(data, level)
        del data['hs6']
        data = data.groupby(['year', 'eiso3c', 'sitc%s'%level]).sum()
        print "[Returning] BACI HS96 Source => EXPORT data for SITCR%sL%s with ISO3C Countries" % (revision, level)
    elif data_type == "import" or data_type == "imports":
        #-Import Level: drop the exporter dimension and collapse over partners-#
        del data['eiso3n']
        data = data.groupby(['year', 'iiso3n', 'hs6']).sum().reset_index()
        #-Merge in ISO3C-#
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='iiso3n')
        data = dropna_iso3c(data, column='iiso3c')
        #-Merge in SITC codes; unmatched hs6 become np.nan-#
        data['sitc%s'%level] = data['hs6'].apply(lambda x: concord_data(concordance, x, issue_error=np.nan))
        if check_concordance:
            check_concordance_helper(data, level)
        del data['hs6']
        data = data.groupby(['year', 'iiso3c', 'sitc%s'%level]).sum()
        print "[Returning] BACI HS96 Source => IMPORT data for SITCR%sL%s with ISO3C Countries" % (revision, level)
    else:
        raise ValueError("'data' must be 'trade', 'export', or 'import'")
    #-Data-#
    if not multiindex:
        data = data.reset_index()
    return data
예제 #8
0
def construct_sitc(data,
                   data_classification,
                   data_type,
                   level,
                   revision,
                   check_concordance=True,
                   adjust_units=False,
                   concordance_institution="un",
                   multiindex=False,
                   verbose=True):
    """
    A Self Contained Function for Producing SITC Datasets from the BACI Dataset
    **Note:** Self Contained methods reduce the Need to Debug other routines and methods.
    The other constructor methods are however useful to diagnose issues and to understand properties of the dataset

    NOTE(review): this is an auto-formatted duplicate of the construct_sitc
    defined earlier in this file; at import time this later definition
    shadows the earlier one.

    Parameters
    ----------
    data            :   pd.DataFrame 
                        Pandas DataFrame that contains RAW BACI HS data.
                        WARNING: mutated in place (columns 't','i','j','v' are renamed
                        and 'quantity' is deleted) before the collapsed copy is returned.
    data_classification :   str
                        HS classification of the source data (e.g. 'HS96'); used to
                        select the HS -> SITC product concordance
    data_type       :   str
                        Specify data type: 'trade', 'export', 'import'
                        'export' will include values from a country to any region (including NES, and Neutral Zone etc.)
                        'import' will include values to a country from any region (including NES, and Neutral Zone etc.)
    level           :   int 
                        Specify SITC Chapter Level (1,2,3,4 or 5)
    revision        :   int 
                        Specify which SITC Revision to use (Revision 1,2,3, or 4)
    check_concordance : bool, optional(default=True)
                        Check Concordance for Full Matches to ensure no orphaned data
                        (raises ValueError listing any hs6 codes with no SITC match)
    adjust_units    :   bool, optional(default=False)
                        Adjust units to $'s from 1000 of $'s. Default is to keep the base dataset values in 1000's
    concordance_institution     :   str, optional(default="un")
                                    Specify which institution to use for product concordance information
    multiindex      :   bool, optional(default=False)
                        Return dataset with a multi-index object

    Returns
    -------
    pd.DataFrame
        Values collapsed (summed) by year / iso3c country code(s) / sitc<level>;
        returned as a MultiIndex frame when multiindex=True, otherwise reset to columns

    Raises
    ------
    ValueError
        If data_type is not 'trade', 'export'/'exports' or 'import'/'imports',
        or (when check_concordance=True) if any hs6 code fails to concord

    Notes
    -----
    1. Will need to consider the source BACI dataset HS96 etc. to fetch the correct Concordance
    2. When joining with other datasets, global merge attributes should be considered externally to this method
    3. Currently this excludes Quantity Information

    """

    #-Helper Functions-#
    def merge_iso3c_and_replace_iso3n(data, cntry_data, column):
        " Merge ISO3C and Replace match on column (i.e. eiso3n)"
        data = data.merge(cntry_data,
                          how='left',
                          left_on=[column],
                          right_on=['iso3n'])
        del data['iso3n']
        del data[column]
        #-e.g. 'eiso3n' -> 'eiso3c' (swap the trailing 'n' for 'c')-#
        data.rename(columns={'iso3c': column[0:-1] + 'c'}, inplace=True)
        return data

    def dropna_iso3c(data, column):
        " Drop iiso3c or eiso3c isnull() values "
        #-Rows with no ISO3C match (left-merge misses, e.g. NES and regional codes) are removed in place-#
        if column == 'iiso3c':
            data.drop(data.loc[(data.iiso3c.isnull())].index, inplace=True)
        elif column == 'eiso3c':
            data.drop(data.loc[(data.eiso3c.isnull())].index, inplace=True)
        return data

    def check_concordance_helper(data, level):
        " Raise ValueError if any hs6 product failed to obtain an SITC code "
        check = data.loc[data['sitc%s' % level].isnull()]
        if len(check) > 0:
            raise ValueError(
                "Concordance doesn't provide match for the following products: %s"
                % (check.hs6.unique()))

    #-Obtain Key Index Variables-#
    #-Note: this renames the caller's DataFrame in place-#
    data.rename(columns={
        't': 'year',
        'i': 'eiso3n',
        'j': 'iiso3n',
        'v': 'value',
        'q': 'quantity'
    },
                inplace=True)  #'hs6' is unchanged
    #-Exclude Quantity-#
    del data['quantity']
    #-Import Country Codes to ISO3C-#
    from pyeconlab.trade.dataset.CEPIIBACI.meta import hs96_iso3n_to_iso3c  #This doesn't include np.nan - is this going to be an issue?
    #-Coerce keys to int to match the iso3n merge columns-#
    hs96_iso3n_to_iso3c = {int(k): v for k, v in hs96_iso3n_to_iso3c.items()}
    cntry_data = pd.Series(hs96_iso3n_to_iso3c).to_frame().reset_index()
    cntry_data.columns = ['iso3n', 'iso3c']
    #-NOTE(review): DataFrame.sort was removed in pandas 0.20+; sort_values(by=['iso3n']) is the modern equivalent-#
    cntry_data = cntry_data.sort(columns=['iso3n'])
    #-Import Product Concordance-#
    from pyeconlab.trade.concordance import HS_To_SITC
    concordance = HS_To_SITC(hs=data_classification,
                             sitc="SITCR%s" % revision,
                             hs_level=6,
                             sitc_level=level,
                             source_institution=concordance_institution,
                             verbose=verbose).concordance
    #-Add Special Cases to the concordance-#

    # from .base import BACI
    # for k,v in BACI.adjust_hs6_to_sitc[data_classification].items():  #This Needs a Level Consideration
    #     concordance[k] = v

    #-Parse Options-#
    #-Change Value Units (from 1000's of $'s to $'s)-#
    if adjust_units:
        data['value'] = data['value'] * 1000
    #-Collapse Trade Data based on data option-#
    if data_type == "trade":
        #-Merge in ISO3C-#
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='eiso3n')
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='iiso3n')
        print "[WARNING] Dropping Countries where iso3c has null() values will remove country export/import from NES, and other regions!"
        data = dropna_iso3c(data, column='eiso3c')
        data = dropna_iso3c(data, column='iiso3c')
        #-Merge in SITC codes; unmatched hs6 become np.nan (caught below when check_concordance=True)-#
        data['sitc%s' % level] = data['hs6'].apply(
            lambda x: concord_data(concordance, x, issue_error=np.nan))
        if check_concordance:
            check_concordance_helper(data, level)
        del data['hs6']
        data = data.groupby(['year', 'eiso3c', 'iiso3c',
                             'sitc%s' % level]).sum()
        print "[Returning] BACI HS96 Source => TRADE data for SITCR%sL%s with ISO3C Countries" % (
            revision, level)
    elif data_type == "export" or data_type == "exports":
        #-Export Level: drop the importer dimension and collapse over partners-#
        del data['iiso3n']
        data = data.groupby(['year', 'eiso3n', 'hs6']).sum().reset_index()
        #-Merge in ISO3C-#
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='eiso3n')
        data = dropna_iso3c(data, column='eiso3c')
        #-Merge in SITC codes; unmatched hs6 become np.nan-#
        data['sitc%s' % level] = data['hs6'].apply(
            lambda x: concord_data(concordance, x, issue_error=np.nan))
        if check_concordance:
            check_concordance_helper(data, level)
        del data['hs6']
        data = data.groupby(['year', 'eiso3c', 'sitc%s' % level]).sum()
        print "[Returning] BACI HS96 Source => EXPORT data for SITCR%sL%s with ISO3C Countries" % (
            revision, level)
    elif data_type == "import" or data_type == "imports":
        #-Import Level: drop the exporter dimension and collapse over partners-#
        del data['eiso3n']
        data = data.groupby(['year', 'iiso3n', 'hs6']).sum().reset_index()
        #-Merge in ISO3C-#
        data = merge_iso3c_and_replace_iso3n(data, cntry_data, column='iiso3n')
        data = dropna_iso3c(data, column='iiso3c')
        #-Merge in SITC codes; unmatched hs6 become np.nan-#
        data['sitc%s' % level] = data['hs6'].apply(
            lambda x: concord_data(concordance, x, issue_error=np.nan))
        if check_concordance:
            check_concordance_helper(data, level)
        del data['hs6']
        data = data.groupby(['year', 'iiso3c', 'sitc%s' % level]).sum()
        print "[Returning] BACI HS96 Source => IMPORT data for SITCR%sL%s with ISO3C Countries" % (
            revision, level)
    else:
        raise ValueError("'data' must be 'trade', 'export', or 'import'")
    #-Data-#
    if not multiindex:
        data = data.reset_index()
    return data
예제 #9
0
def construct_sitcr2(df,
                     data_type,
                     level,
                     AX=True,
                     dropAX=True,
                     sitcr2=True,
                     drop_nonsitcr2=True,
                     adjust_hk=(False, None),
                     intertemp_productcode=(False, None),
                     intertemp_cntrycode=False,
                     drop_incp_cntrycode=False,
                     adjust_units=False,
                     source_institution='un',
                     harmonised_raw=False,
                     values_only=False,
                     verbose=True):
    """
        Construct a Self Contained (SC) Direct Action Dataset for Countries at the SITC Revision 2 Level 3
        
        There are no checks on the incoming dataframe to ensure data integrity.
        This is your responsibility

        STATUS: tests/test_constructor_dataset_sitcr2l3.py

        Parameters
        ----------
        df                  :   DataFrame
                                Pandas DataFrame containing the raw data
        data_type           :   str
                                Specify what type of data 'trade', 'export', 'import'
        level               :   int
                                Specify Level of Final dataset (i.e. SITC Level 1, 2, 3, or 4)
        AX                  :   bool, optional(default=True)
                                Add a Marker for Codes that Include 'A' and 'X'
        dropAX              :   bool, optional(default=True)
                                Drop AX Codes at the Relevant Level (i.e. SITC Level 3 Data will include appropriate A and X codes)
        sitcr2              :   bool, optional(default=True)
                                Add SITCR2 Indicator
        drop_nonsitcr2      :   bool, optional(default=True)
                                Drop non-standard SITC2 Codes
        adjust_hk           :   Tuple(bool, df), optional(default=(False, None))
                                Adjust the Hong Kong Data using NBER supplemental files which needs to be supplied as a dataframe
        intertemp_productcode : Tuple(bool, dict), optional(default=False, None)
                                Apply an Intertemporal Product Code System drop a conversion dictionary (IC["drop"] = [], IC["collapse"] = [])
                                Note this will override the drop_nonsitcr2 option
        intertemp_cntrycode :   bool, optional(default=False)
                                Generate Intertemporal Consistent Country Units (from meta)
        drop_incp_cntrycode :   bool, optional(default=False)
                                Drop Incomplete Country Codes (from meta)
        adjust_units        :   bool, optional(default=False)
                                Adjust units by a factor of 1000 to specify in $'s
        source_institution  :   str, optional(default='un')
                                which institutions SITC classification to use
        harmonised_raw      :   bool, optional(default=False)
                                Return simple RAW dataset with Quantity disaggregation collapsed and eiso3c and iiso3c columns (Note: You may use hk_adjust with this option)
        values_only         :   bool, optional(default=False)
                                Return Values and Relevant Index Data Only (i.e. drop 'AX', 'sitcr2')

        Notes
        -----
        1. Operations ::

            [1] Adjust Hong Kong and China Data
            [2] Drop SITC4 to SITC3 Level (for greater intertemporal consistency)
            [3] Import ISO3C Codes as Country Codes
            [4] Drop Errors in SITC3 codes ["" Codes]
            
            Optional:
            ---------
            [A] Drop sitc3 codes that contain 'A' and 'X' codes [Default: True]
            [B] Drop Non-Standard SITC3 Codes [i.e. Aren't in the Classification]
            [C] Construct an Intertemporal Product Code Classification and Adjust Dataset
            [C] Adjust iiso3c, eiso3c country codes to be intertemporally consistent
            [D] Drop countries with incomplete 'total' data across 1962 to 2000 (strict measure) [Identification Debatable]
  

        3. This makes use of countryname_to_iso3c in the meta data subpackage
        4. This method can be tested using /do/basic_sitc3_country_data.do
        5. DropAX + Drop NonStandard SITC Rev 2 Codes still contains ~94-96% of the data found in the raw data

        ..  Future Work
            -----------
            1. Check SITC Revision 2 Official Codes
            2. Add in a Year Filter

        """

    #-Operations Requiring RAW SITC Level 4-#
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    idx = [
        u'year', u'icode', u'importer', u'ecode', u'exporter', u'sitc4',
        u'unit', u'dot'
    ]

    #-Hong Kong China Data Adjustment Option-#
    if type(adjust_hk) == bool:
        adjust_hk = (adjust_hk, None)
    if adjust_hk[0]:
        if verbose: print "[INFO] Adjusting Hong Kong and China Values"
        hkdata = adjust_hk[1]
        #-Values-#
        raw_value = df[idx + ['value']].rename(columns={'value': 'value_raw'})
        try:
            adjust_value = hkdata[idx + ['value_adj']]
        except:
            raise ValueError(
                "[ERROR] China/Hong Kong Data has not been passed in properly!"
            )
        #-Note: Current merge_columns utility merges one column set at a time-#
        df = merge_columns(raw_value,
                           adjust_value,
                           idx,
                           collapse_columns=('value_raw', 'value_adj',
                                             'value'),
                           dominant='right',
                           output='final',
                           verbose=verbose)
        #-Note: Adjust Quantity has not been implemented. See NBERWTF constructor -#

    #-Filter Data-#
    idx = [
        u'year', u'exporter', u'importer', u'sitc4'
    ]  #Note: This collapses duplicate entries with unit differences (collapse_valuesonly())
    df = df.loc[:, idx + ['value']]

    #-Raw Trade Data Option with Added IISO3C and EISO3C-#
    if harmonised_raw and data_type == "trade":
        df = df.groupby(
            idx).sum().reset_index()  #Sum Over Quantity Disaggregations
        #-Add EISO3C and IISO3C-#
        df['eiso3c'] = df['exporter'].apply(lambda x: countryname_to_iso3c[x])
        df['iiso3c'] = df['importer'].apply(lambda x: countryname_to_iso3c[x])
        return df
    if harmonised_raw and data_type in {"export", "import"}:
        warnings.warn(
            "Cannot run harmonised_raw over export and import data as raw data is trade data"
        )
        return None

    #-Collapse to SITC Level -#
    if level != 4:
        if verbose: print "[INFO] Collapsing to SITC Level %s Data" % level
        df['sitc%s' % level] = df.sitc4.apply(lambda x: x[0:level])
        df = df.groupby(['year', 'exporter', 'importer',
                         'sitc%s' % level]).sum()['value'].reset_index()
    elif level == 4:
        if verbose: print "[INFO] Data is already at the requested level"
    else:
        raise ValueError("Level must be 1, 2, 3, or 4 for the NBER data")

    #-Operations Post Collapse to SITC Level-#
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #-Countries Only Adjustment-#
    if verbose:
        print "[INFO] Removing 'World' values so that the dataset is country only data"
    df = df.loc[(df.exporter != "World") & (df.importer != "World")]

    #-Add Country ISO Information-#
    #-Exports (can include NES on importer side)-#
    if data_type == 'export' or data_type == 'exports':
        if verbose: print "[INFO] Adding eiso3c using nber meta data"
        df['eiso3c'] = df.exporter.apply(lambda x: countryname_to_iso3c[x])
        df = df.loc[(df.eiso3c != '.')]
        df = df.groupby(['year', 'eiso3c',
                         'sitc%s' % level]).sum()['value'].reset_index()
    #-Imports (can include NES on importer side)-#
    elif data_type == 'import' or data_type == 'imports':
        if verbose: print "[INFO] Adding iiso3c using nber meta data"
        df['iiso3c'] = df.importer.apply(lambda x: countryname_to_iso3c[x])
        df = df.loc[(df.iiso3c != '.')]
        df = df.groupby(['year', 'iiso3c',
                         'sitc%s' % level]).sum()['value'].reset_index()
    #-Trade-#
    else:
        if verbose:
            print "[INFO] Adding eiso3c and iiso3c using nber meta data"
        df['iiso3c'] = df.importer.apply(lambda x: countryname_to_iso3c[x])
        df['eiso3c'] = df.exporter.apply(lambda x: countryname_to_iso3c[x])
        df = df.loc[(df.iiso3c != '.') & (df.eiso3c != '.')]
        df = df.groupby(['year', 'eiso3c', 'iiso3c',
                         'sitc%s' % level]).sum()['value'].reset_index()

    #-Remove Product Code Errors in Dataset-#
    df = df.loc[(df['sitc%s' % level] != "")]  #Does this need a reset_index?

    #-productcodes-#
    if intertemp_productcode[0]:
        if level == 1:
            intertemp_productcode = (False, intertemp_productcode[1])
        else:
            AX = True
            dropAX = True  #Small Impact Post 1984 (Levels < 4 Include 'A' and 'X' values due to the collapse)
            sitcr2 = True  #Encode SITCR2 for Parsing
            drop_nonsitcr2 = False

    #-AX-#
    if AX:
        if verbose: print "[INFO] Adding Indicator Codes of 'A' and 'X'"
        df['AX'] = df['sitc%s' %
                      level].apply(lambda x: 1 if re.search("[AX]", x) else 0)
        #-dropAX-#
        if dropAX:
            if verbose: print "[INFO] Dropping SITC Codes with 'A' or 'X'"
            df = df.loc[df.AX != 1]
            del df['AX']
        if not dropAX and values_only:
            del df['AX']

    #-Intertemporal ProductCodes-#
    if intertemp_productcode[0]:
        if verbose:
            print "[INFO] Computing Intertemporally Consistent ProductCodes ..."
        #-This Method relies on meta data computed by pyeconlab nberwtf constructor-#
        IC = intertemp_productcode[1]  #Dict("drop" and "collapse" code lists)
        #-Drop Codes-#
        drop_codes = IC["drop"]
        if verbose:
            print "Dropping the following productcodes ..."
            print drop_codes
        keep_codes = set(df['sitc%s' % level].unique()).difference(
            set(drop_codes))
        df = df.loc[df["sitc%s" % level].isin(keep_codes)].copy(deep=True)
        #-Collapse Codes-#
        collapse_codes = IC["collapse"]
        if verbose:
            print "Collapsing the following productcodes ..."
            print collapse_codes
        collapse_codes = {x[0:level - 1]
                          for x in collapse_codes}  #-Simplify Computations-#
        for code in collapse_codes:
            df["sitc%s" % level] = df["sitc%s" % level].apply(
                lambda x: code if x[0:level - 1] == code else x)
        #-Recodes-#
        # Map productcodes listed in the concordance's "recode" table onto
        # their replacement codes, then collapse any duplicates this creates.
        recodes = IC["recode"]
        recode_codes = set(recodes.keys())
        if verbose:
            print "Recoding the following productcodes ..."
            print recode_codes
        # NOTE(review): the loop variable 'code' is unused in the lambda —
        # a single .apply pass already performs the full recode mapping, so
        # this repeats identical work len(recode_codes) times.
        for code in recode_codes:
            df["sitc%s" % level] = df["sitc%s" % level].apply(
                lambda x: recodes[x] if x in recode_codes else x)
        # Recoding can merge several old codes into one; sum 'value' over the
        # now-duplicated key columns (every column except 'value').
        df = df.groupby(list(df.columns.drop("value"))).sum()
        df = df.reset_index()

    #-Official SITCR2 Codes-#
    # Flag each productcode against the official SITC Revision 2 code list
    # at this level; optionally keep only the official codes.
    if sitcr2:
        if verbose: print "[INFO] Adding SITCR2 Indicator"
        sitc = SITC(revision=2, source_institution=source_institution)
        codes = sitc.get_codes(level=level)
        # 'sitcr2' = 1 if the code is an official SITCR2 code, else 0
        df['sitcr2'] = df['sitc%s' %
                          level].apply(lambda x: 1 if x in codes else 0)
        if drop_nonsitcr2:
            if verbose: print "[INFO] Dropping Non Standard SITCR2 Codes"
            df = df.loc[(df.sitcr2 == 1)]
            del df['sitcr2']  #No Longer Needed
        # NOTE(review): 'values_only' is not among the parameters visible in
        # this function's documented signature — presumably defined in the
        # enclosing scope; confirm it exists to avoid a NameError here.
        if not drop_nonsitcr2 and values_only:
            del df['sitcr2']

    #-Adjust Country Codes to be Intertemporally Consistent-#
    # Recode exporter/importer iso3c codes to a 1962-2000 consistent set.
    # concord_data(..., issue_error=False) returns x unchanged when there is
    # no match; codes mapped to '.' are marked for deletion and dropped,
    # after which rows are re-aggregated over the surviving codes.
    if intertemp_cntrycode:
        #-Export-#
        if data_type == 'export' or data_type == 'exports':
            if verbose:
                print "[INFO] Imposing dynamically consistent eiso3c recodes across 1962-2000"
            df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(
                iso3c_recodes_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df = df[df['eiso3c'] != '.']  #-Drop countries recoded to '.'-#
            df = df.groupby(['year', 'eiso3c',
                             'sitc%s' % level]).sum().reset_index()
        #-Import-#
        elif data_type == 'import' or data_type == 'imports':
            if verbose:
                print "[INFO] Imposing dynamically consistent iiso3c recodes across 1962-2000"
            df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(
                iso3c_recodes_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df = df[df['iiso3c'] != '.']  #-Drop countries recoded to '.'-#
            df = df.groupby(['year', 'iiso3c',
                             'sitc%s' % level]).sum().reset_index()
        #-Trade-#
        # Any other data_type is treated as bilateral trade: recode both
        # exporter and importer codes.
        else:
            if verbose:
                print "[INFO] Imposing dynamically consistent iiso3c and eiso3c recodes across 1962-2000"
            df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(
                iso3c_recodes_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(
                iso3c_recodes_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df = df[df['iiso3c'] != '.']
            df = df[df['eiso3c'] != '.']
            df = df.groupby(['year', 'eiso3c', 'iiso3c',
                             'sitc%s' % level]).sum().reset_index()

    #-Drop Incomplete Country Codes-#
    # Same '.'-marking mechanism as above, but against the list of countries
    # with incomplete coverage over 1962-2000. No re-aggregation is needed
    # here because rows are only removed, never merged.
    if drop_incp_cntrycode:
        if verbose:
            print "[INFO] Dropping countries with incomplete data across 1962-2000"
        #-Export-#
        if data_type == 'export' or data_type == 'exports':
            df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(
                incomplete_iso3c_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df = df[df['eiso3c'] != '.']
        #-Import-#
        elif data_type == 'import' or data_type == 'imports':
            df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(
                incomplete_iso3c_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df = df[df['iiso3c'] != '.']
        #-Trade-#
        else:
            df['iiso3c'] = df['iiso3c'].apply(lambda x: concord_data(
                incomplete_iso3c_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df['eiso3c'] = df['eiso3c'].apply(lambda x: concord_data(
                incomplete_iso3c_for_1962_2000, x, issue_error=False
            ))  #issue_error = false returns x if no match
            df = df[df['iiso3c'] != '.']
            df = df[df['eiso3c'] != '.']
        # Rebuild a clean 0..n-1 index after the row drops.
        # NOTE(review): equivalent to df.reset_index(drop=True).
        df = df.reset_index()
        del df['index']

    #-Adjust Units from 1000's to $'s-#
    if adjust_units:
        if verbose: print "[INFO] Adjusting 'value' units to $'s"
        df['value'] = df['value'] * 1000  #Convert 'value' from 1000's of $'s to $'s

    #-Return Dataset-#
    if verbose:
        print "[INFO] Finished Computing Dataset (%s) ..." % (data_type)
    return df