Пример #1
0
def new_create_dict(df):
    """Count token occurrences over the whole frame.

    Each row's ``'lead_paragraph_sw_stem'`` value is converted to ``str`` and
    split into tokens by ``tokenizer.clean_text_tokenize``; every token
    occurrence adds one to that token's total.

    Args:
        df: DataFrame with a ``'lead_paragraph_sw_stem'`` column.

    Returns:
        A list of ``(token, count)`` tuples ordered by count, then by token,
        both descending.
    """
    counts = {}
    for _, record in df.iterrows():
        text = str(record['lead_paragraph_sw_stem'])
        for token in tokenizer.clean_text_tokenize(text):
            counts[token] = counts.get(token, 0) + 1

    ranked = list(counts.items())
    # Ties on the count are broken by the token itself, also descending.
    ranked.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return ranked
Пример #2
0
def create_covid_dict(df):
    """Count covid-related tokens per date.

    For every row, the ``'lead_paragraph_sw_stem'`` value is converted to
    ``str`` and tokenized with ``tokenizer.clean_text_tokenize``. Each token
    equal to ``'covid'``, ``'corona'`` or ``'coronaviru'`` adds one to the
    running total kept for the row's ``'date'``.

    Args:
        df: DataFrame with ``'date'`` and ``'lead_paragraph_sw_stem'`` columns.

    Returns:
        A list of ``(date, count)`` tuples sorted ascending by date. Dates
        whose rows contain no matching token are still present with count 0.
    """
    covid_terms = ('covid', 'corona', 'coronaviru')
    per_date = {}
    for _, record in df.iterrows():
        day = record['date']
        tokens = tokenizer.clean_text_tokenize(str(record['lead_paragraph_sw_stem']))
        hits = sum(1 for token in tokens if token in covid_terms)
        # Rows sharing a date accumulate into the same entry.
        per_date[day] = per_date.get(day, 0) + hits
    return sorted(per_date.items(), key=lambda item: (item[0], item[1]))
Пример #3
0
def create_monthly_dict(df, top_n=6):
    """Return the most frequent tokens for each month number.

    Rows are bucketed by ``row['date'].month``, so rows from the same
    calendar month of different years land in the same bucket. Within a
    bucket, each row's ``'lead_paragraph_sw_stem'`` value is converted to
    ``str``, tokenized with ``tokenizer.clean_text_tokenize`` and counted.

    Args:
        df: DataFrame with a ``'date'`` column whose values expose a
            ``.month`` attribute, and a ``'lead_paragraph_sw_stem'`` column.
        top_n: How many of the highest-ranked tokens to keep per month.
            Defaults to 6 (the previously hard-coded limit); ``None`` keeps
            every token.

    Returns:
        A dict mapping month number to a list of ``(token, count)`` tuples,
        ordered by count and then token, both descending, truncated to
        ``top_n`` entries. Months appear in first-seen order.
    """
    monthly_tf = {}
    for _, row in df.iterrows():
        tf = monthly_tf.setdefault(row['date'].month, {})
        words = tokenizer.clean_text_tokenize(str(row['lead_paragraph_sw_stem']))
        for w in words:
            tf[w] = tf.get(w, 0) + 1

    # Build a fresh mapping instead of rewriting values while iterating keys.
    return {
        month: sorted(tf.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)[:top_n]
        for month, tf in monthly_tf.items()
    }
Пример #4
0
def create_df_old(df, desks):
    """Build a per-date term-frequency table restricted to selected desks.

    Every distinct ``'date'`` value in ``df`` yields one output row, in
    first-seen order. A row whose ``'news_desk'`` is in ``desks`` has its
    ``'lead_paragraph_sw_stem'`` value converted to ``str``, tokenized with
    ``tokenizer.clean_text_tokenize`` and counted into that date's
    dictionary. Rows from other desks only make sure their date is present.

    Fixes over the previous implementation:
      * no all-NaN placeholder row at the top of the result;
      * a date that was already seen reuses its own dict, not the whole
        ``'tf'`` column;
      * counts are stored as dicts, not written through a
        ``pd.Series(tf_dict)`` assignment that aligned on the wrong labels;
      * no ``DataFrame.append`` (removed in pandas 2.0, and quadratic when
        called per row).

    Args:
        df: DataFrame with ``'date'``, ``'news_desk'`` and
            ``'lead_paragraph_sw_stem'`` columns.
        desks: Container of news-desk values whose rows should be counted.

    Returns:
        A DataFrame with columns ``'date'`` and ``'tf'``, where ``'tf'`` holds
        a ``{token: count}`` dict per date (empty when no row of that date
        belongs to ``desks``). An empty ``df`` gives an empty frame with the
        same two columns.
    """
    tf_by_date = {}
    for _, row in df.iterrows():
        # Register the date even when the desk is filtered out below.
        tf_dict = tf_by_date.setdefault(row['date'], {})
        if row['news_desk'] in desks:
            words = tokenizer.clean_text_tokenize(str(row['lead_paragraph_sw_stem']))
            for w in words:
                tf_dict[w] = tf_dict.get(w, 0) + 1

    return pd.DataFrame(
        {"date": list(tf_by_date.keys()), "tf": list(tf_by_date.values())},
        columns=["date", "tf"],
    )