Exemplo n.º 1
0
        # Load the parsed rows into a dataframe; the first entry of
        # `contents` supplies the column names:
        df = pd.DataFrame(data=contents[1:])
        df.columns = contents[0]

        # Index the rows by the numeric part of `docid` (any trailing
        # lowercase letters are stripped before the int conversion).
        # Ordering by docid lets us gauge posting dates even if that's
        # missing for a doc:
        dfindex = [int(re.sub('[a-z]*$', '', doc)) for doc in df.docid]
        df.index = dfindex
        df = df.sort_index()

        # This one file gets a dedicated date fix-up before the dates are
        # parsed (the details live in Config.fix_heller_dates):
        if file == '43-heller.csv':
            df = Config.fix_heller_dates(df)

        # Subset to 2013-2016:
        # Parse the dates, flag the rows dated on/after 2013-01-03, then
        # keep every row up to and including the highest index label that
        # carries such a date. Only this lower cutoff is applied here.
        # Rows with a missing date (NaT) compare False, but are retained
        # when they sit at or before that label.
        df['form_date'] = pd.to_datetime(df['form_date'], yearfirst=True)
        cutoff = pd.to_datetime('2013-01-03', yearfirst=True)
        validdates = df['form_date'] >= cutoff

        # Pick the flagged labels with a boolean mask. The earlier
        # `max(index.where(mask))` form left NaN in every unflagged slot,
        # and the builtin max() returns NaN whenever the first slot is NaN
        # (no later value compares greater than NaN).
        validlabels = validdates.index[validdates.values]
        if len(validlabels) > 0:
            lastvalid = validlabels.max()
            df = df.loc[:lastvalid, :]
        else:
            # No row meets the cutoff (or the frame is empty): keep nothing,
            # but preserve the columns.
            lastvalid = None
            df = df.iloc[0:0]

        # Keep only the docs whose `clean_text` is not the empty string:
        notempty = df['clean_text'].ne('')
        df = df[notempty]

        # Number of docs left after filtering:
        n_docs = df.shape[0]