示例#1
0
def aggregate_preprocess_results(codes, dict_edits, dict_newcomers, dict_reverts):
    """Aggregate the preprocessed results of every code into one DataFrame.

    For each entry of ``codes`` the frame returned by ``process_edits`` is
    renamed, grouped by ``date`` / ``covid`` / ``user_kind`` and summed, then
    passed through ``process_newcomers`` and ``process_reverts``. Missing
    values are filled with 0, a ``code`` column is added and the ``index``
    column is dropped before the frame is collected.

    A failure while handling one code is printed and logged, and that code is
    skipped; the remaining codes are still processed.

    Args:
        codes: Iterable of codes to process.
        dict_edits: Passed to ``process_edits`` together with each code.
        dict_newcomers: Passed to ``process_newcomers`` together with each code.
        dict_reverts: Passed to ``process_reverts`` together with each code.

    Returns:
        The concatenation of the per-code frames, or an empty DataFrame when
        no code produced a frame (empty ``codes`` or every code failed).
    """
    renamed_columns = {
        "title": "index",
        'event_user_id': 'count',
        'revision_text_bytes_diff': 'rev_len_sum',
    }
    aggs = []

    for code in codes:
        start = time.time()
        try:
            df_gb = process_edits(dict_edits, code)
            # group edits
            df_gb.rename(renamed_columns, inplace=True, axis=1)
            final = df_gb.groupby(["date", "covid", "user_kind"]).sum().reset_index()

            final = process_newcomers(dict_newcomers, code, final)
            final = process_reverts(dict_reverts, code, final)

            final = final.fillna(0)
            final["code"] = code
            # Keep every column except the renamed title column.
            aggs.append(final.loc[:, final.columns != 'index'])
        except Exception as e:
            # Best effort: report the failure and continue with the next code.
            traceback.print_exc()
            Logger.instance('pipeline').info(f'Error for {code}: {str(e)}')
        Logger.instance('pipeline').info(f'Processing {code} took {time.time() - start}')

    # pd.concat raises ValueError on an empty list; since per-code failures are
    # tolerated above, an empty result must not crash the whole aggregation.
    if not aggs:
        return pd.DataFrame()

    return pd.concat(aggs)
示例#2
0
import json
import namedtupled
import os

from helpers.logger import Logger

# Module-level logger obtained from the project's Logger helper.
logger = Logger.instance()

# Base name of the JSON configuration file. load_constants() tries it relative
# to the current directory first, then one and two directory levels up.
CONFIGURATION_FILENAME = 'conf.json'


def filename_to_named_tuple(filename):
    """Load a JSON file and convert its content with ``namedtupled.map``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The result of ``namedtupled.map`` applied to the parsed JSON document.

    Raises:
        FileNotFoundError: If ``filename`` does not exist (raised by ``open``).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # locale encoding, which would mis-decode non-ASCII values on some systems.
    with open(filename, encoding='utf-8') as data_file:
        parsed = json.load(data_file)
    return namedtupled.map(parsed)


def load_constants():
    c_ = None
    try:
        c_ = filename_to_named_tuple(CONFIGURATION_FILENAME)
    except FileNotFoundError as e:
        try:
            c_ = filename_to_named_tuple(
                os.path.join('..', CONFIGURATION_FILENAME))
        except FileNotFoundError as e:
            try:
                c_ = filename_to_named_tuple(
                    os.path.join('..', '..', CONFIGURATION_FILENAME))
            except: