def main():
    """

    """
    ## Parse Command Line Arguments
    args = parse_command_line()
    ## Load Settings
    settings = load_settings()
    ## Update Max Documents Parameter Based On Command Line
    MODEL._vocabulary._max_docs = args.max_docs
    ## Create Coordinate Grid
    if not args.known_coordinates:
        coordinates = MODEL._create_coordinate_grid(args.grid_cell_size)
    else:
        coordinates = load_known_coordinates(settings)
    ## Load User List
    user_data_paths = load_users()
    ## Format Data Paths into Chunks
    if args.chunksize:
        user_data_chunks = list(chunks(user_data_paths, args.chunksize))
    else:
        user_data_chunks = [user_data_paths]
    ## Process Data in Chunks
    all_preds = []
    for d, data_chunk in enumerate(user_data_chunks):
        ## Update User
        LOGGER.info("[Processing User Data Chunk {}/{}]".format(
            d + 1, len(user_data_chunks)))
        ## Prepare User Data
        data_chunk, X, n = prepare_data(data_chunk)
        ## Make Predictions
        LOGGER.info("Making Inferences")
        _, P = MODEL.predict_proba(X, coordinates)
        y_pred = pd.DataFrame(index=data_chunk,
                              data=coordinates[P.argmax(axis=1)],
                              columns=["longitude_argmax", "latitude_argmax"])
        ## Reverse Geocoding
        if args.reverse_geocode:
            LOGGER.info("Reversing the Geolocation Inferences")
            reverse = reverse_search(
                y_pred[["longitude_argmax", "latitude_argmax"]].values)
            for level, level_name in zip(
                ["name", "admin2", "admin1", "cc"],
                ["city", "county", "state", "country"]):
                level_data = [i[level] for i in reverse]
                y_pred[f"{level_name}_argmax"] = level_data
        ## Add Posterior
        if args.posterior:
            P = pd.DataFrame(P,
                             index=data_chunk,
                             columns=list(map(tuple, coordinates)))
            y_pred = pd.merge(y_pred, P, left_index=True, right_index=True)
        all_preds.append(y_pred)
    ## Cache
    all_preds = pd.concat(all_preds)
    LOGGER.info("Caching Inferences")
    all_preds.to_csv(args.output_csv, index=True)
    ## Done
    LOGGER.info("Script complete.")

#######################
### Representative Posts
#######################

LOGGER.info("Identifying Representative Examples")

## Post Cache
rep_cache_file = f"{CACHE_DIR}representative_examples.json"

## Load Representatives
if not os.path.exists(rep_cache_file) or RERUN:
    ## Find Representative Posts
    representative_examples = []
    keyword_chunks = list(chunks(keyword_counts.columns.tolist(), 40))
    for keyword_chunk in tqdm(keyword_chunks,
                              position=0,
                              desc="Keyword Chunk",
                              file=sys.stdout):
        keyword_examples = find_keyword_examples(filenames,
                                                 keyword_chunk,
                                                 n=100,
                                                 indorgs=INDORGS,
                                                 genders=GENDERS,
                                                 locations=LOCATIONS)
        representative_examples.append(keyword_examples)
    representative_examples = pd.concat(representative_examples).reset_index(
        drop=True)
    ## Cache
    with open(rep_cache_file, "w") as the_file:
        ## Serialization format assumed (JSON records matching the .json cache file)
        the_file.write(representative_examples.to_json(orient="records"))
else:
    ## Load From Cache
    representative_examples = pd.read_json(rep_cache_file, orient="records")
Example #3
def predict_and_interpret(filenames,
                          model,
                          min_date=None,
                          max_date=None,
                          n_samples=None,
                          randomized=False,
                          interpret=False,
                          bootstrap_samples=100,
                          bootstrap_sample_percent=30,
                          ignore_missing=True,
                          chunksize=None):
    """

    """
    ## Date Boundaries
    if min_date is not None and isinstance(min_date, str):
        min_date = pd.to_datetime(min_date)
    if max_date is not None and isinstance(max_date, str):
        max_date = pd.to_datetime(max_date)
    ## Get Chunks
    if chunksize is None:
        chunksize = len(filenames)
    filechunks = list(chunks(filenames, chunksize))
    ## Initialize Cache
    X_test = []
    support = []
    y_pred = {}
    n = {}
    tn = {}
    tn_binary = {}
    filtered_filenames = []
    ## Cycle Through Chunks
    for j, file_chunk in enumerate(filechunks):
        LOGGER.info("[Beginning to Process File Chunk {}/{}]".format(
            j + 1, len(filechunks)))
        ## Vectorize the data
        LOGGER.info("Vectorizing Test Files")
        chunk_files, X_chunk, _, n_ = model._load_vectors(
            file_chunk,
            None,
            min_date=min_date,
            max_date=max_date,
            n_samples=n_samples,
            randomized=randomized,
            return_post_counts=True)
        ## Ignore Users without any features
        if ignore_missing:
            LOGGER.info("Filtering Out Users Without Any Recognized Terms")
            missing_mask = np.nonzero(X_chunk.sum(axis=1) > 0)[0]
            chunk_files = [chunk_files[m] for m in missing_mask]
            X_chunk = X_chunk[missing_mask]
            n_ = n_[missing_mask]
        n_ = dict(zip(chunk_files, n_))
        ## Count Tokens
        tn_ = dict(
            (filename, count)
            for filename, count in zip(chunk_files, X_chunk.sum(axis=1)))
        tn_binary_ = dict(
            (filename, count)
            for filename, count in zip(chunk_files, (X_chunk > 0).sum(axis=1)))
        ## Apply Any Additional Preprocessing
        LOGGER.info("Generating Feature Set")
        X_chunk = model.preprocessor.transform(X_chunk)
        ## Feed Forward
        LOGGER.info("Computing Logits")
        support_ = np.multiply(X_chunk, model.model.coef_)
        logits = support_.sum(axis=1) + model.model.intercept_
        ## Get Predictions
        LOGGER.info("Computing Probabilities")
        p = dict(zip(chunk_files, 1 / (1 + np.exp(-logits))))
        ## Cache Results
        if interpret:
            X_test.append(X_chunk)
            support.append(support_)
        y_pred.update(p)
        n.update(n_)
        tn.update(tn_)
        tn_binary.update(tn_binary_)
        filtered_filenames.extend(chunk_files)
    ## Format Cache
    n = np.array([n[filename] for filename in filtered_filenames])
    tn = np.array([tn[filename] for filename in filtered_filenames])
    tn_binary = np.array(
        [tn_binary[filename] for filename in filtered_filenames])
    ## Interpretation
    feature_range = None
    if interpret:
        ## Concatenate Features and Support
        if isinstance(X_test[0], csr_matrix):
            X_test = vstack(X_test).toarray()
        else:
            X_test = np.vstack(X_test)
        if isinstance(support[0], csr_matrix):
            support = vstack(support)
        else:
            support = np.vstack(support)
        ## Get Features
        feature_names = model.get_feature_names()
        ## Get Feature Range (Bootstrap used for Confidence Intervals)
        sample_size = int(X_test.shape[0] * bootstrap_sample_percent / 100)
        feature_range = []
        for _ in tqdm(list(range(bootstrap_samples)),
                      desc="Bootstrap Feature Samples",
                      file=sys.stdout):
            sind = np.random.choice(X_test.shape[0],
                                    size=sample_size,
                                    replace=True)
            feature_range.append(support[sind].mean(axis=0))
        feature_range = np.percentile(np.vstack(feature_range),
                                      [2.5, 50, 97.5],
                                      axis=0)
        feature_range = pd.DataFrame(feature_range.T,
                                     index=feature_names,
                                     columns=["lower", "median", "upper"])
    return y_pred, feature_range, n, tn, tn_binary
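
## Self-contained sketch (not part of the original source) of the bootstrap
## confidence-interval step used above: resample rows of a toy "support" matrix
## with replacement, average each resample, and take the 2.5/50/97.5 percentiles
## per feature to get lower/median/upper bounds.
import numpy as np
import pandas as pd

toy_support = np.random.RandomState(42).normal(size=(200, 5))  ## toy feature support
boot_means = []
for _ in range(100):
    sind = np.random.choice(toy_support.shape[0], size=60, replace=True)
    boot_means.append(toy_support[sind].mean(axis=0))
toy_range = np.percentile(np.vstack(boot_means), [2.5, 50, 97.5], axis=0)
toy_range = pd.DataFrame(toy_range.T,
                         index=[f"feature_{i}" for i in range(5)],
                         columns=["lower", "median", "upper"])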
Example #4
def learn_vocabulary(filenames,
                     chunksize=100,
                     prune_rate=1,
                     min_prune_freq=1,
                     min_date=None,
                     max_date=None,
                     min_n=1,
                     max_n=1,
                     remove_retweets=True,
                     binarize=False,
                     jobs=4,
                     pretokenized=False):
    """
    Args:
        filenames (list of str): Raw data filenames
        chunksize (int): How many files to process in parallel before aggregating
        prune_rate (int): How many chunks to process before pruning vocabulary
        min_prune_freq (int): N-grams below this threshold at each pruning are removed
    """
    ## Storage
    vocab = Counter()
    agg_chunk_counts = Counter()
    ## Initialize Multiprocessor
    mp = Pool(jobs)
    ## Initialize Tokenize/Count Function
    counter = partial(tokenize_and_count,
                      min_date=min_date,
                      max_date=max_date,
                      min_n=min_n,
                      max_n=max_n,
                      remove_retweets=remove_retweets,
                      pretokenized=pretokenized)
    ## Process Data
    filechunks = list(chunks(filenames, chunksize))
    chunks_processed = 0
    for chunk in tqdm(filechunks,
                      desc="Learning Vocabulary",
                      file=sys.stdout,
                      position=0):
        ## Increment Chunk Count
        chunks_processed += 1
        ## Apply Counter
        chunk_counts = list(
            tqdm(mp.imap_unordered(counter, chunk),
                 total=len(chunk),
                 desc="File",
                 file=sys.stdout,
                 position=1,
                 leave=False))
        ## Aggregate Counts
        for cc in chunk_counts:
            if binarize:
                cc = Counter({i: 1 for i in cc.keys()})
            agg_chunk_counts += cc
        ## Update Cache
        if chunks_processed == prune_rate:
            ## Apply Pruning
            agg_chunk_counts = Counter({
                x: y
                for x, y in agg_chunk_counts.items() if y >= min_prune_freq
            })
            ## Add to General Vocab
            vocab += agg_chunk_counts
            ## Reset Chunk Counter
            agg_chunk_counts = Counter()
            chunks_processed = 0
    ## Apply Final Update
    if len(agg_chunk_counts) > 0:
        ## Prune Filter
        agg_chunk_counts = Counter(
            {x: y
             for x, y in agg_chunk_counts.items() if y >= min_prune_freq})
        ## Add to General Vocab
        vocab += agg_chunk_counts
    ## Close Pool
    _ = mp.close()
    ## Initialize Vectorizer using Learned Vocabulary
    global cvec
    cvec = initialize_vectorizer(vocab)
    return vocab
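
## Usage sketch (hypothetical file list and parameter values, not from the
## original source): learn a pruned unigram/bigram vocabulary in parallel.
from glob import glob

example_files = sorted(glob("./data/processed/*.json.gz"))  ## hypothetical path
example_vocab = learn_vocabulary(example_files,
                                 chunksize=100,
                                 prune_rate=5,
                                 min_prune_freq=3,
                                 min_n=1,
                                 max_n=2,
                                 remove_retweets=True,
                                 binarize=False,
                                 jobs=4,
                                 pretokenized=False)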
Example #5
if freq is None:  ## assumed condition: no binning frequency provided
    date_range = [start_date, end_date]
else:
    date_range = [pd.to_datetime(start_date)]
    while date_range[-1] < pd.to_datetime(end_date):
        date_range.append(
            min(date_range[-1] + timedelta(freq), pd.to_datetime(end_date)))
    date_range = [i.date().isoformat() for i in date_range]
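
## Toy illustration (not from the original source) of the date_range construction
## above: with start_date="2020-01-01", end_date="2020-02-01", and freq=7 (days),
## the loop yields weekly boundaries plus the exact end date:
##   ['2020-01-01', '2020-01-08', '2020-01-15', '2020-01-22', '2020-01-29', '2020-02-01']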

## Cache Directory/Plot Directory
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)

## Get Comment Counts over Date Range
author_chunks = list(chunks(sorted(counts.keys()), n=100))
for dstart, dstop in tqdm(zip(date_range[:-1], date_range[1:]),
                          total=len(date_range) - 1,
                          position=0,
                          leave=False,
                          file=sys.stdout,
                          desc="Date Range"):
    for a, author_chunk in tqdm(enumerate(author_chunks),
                                total=len(author_chunks),
                                position=1,
                                leave=False,
                                file=sys.stdout,
                                desc="Author Chunk"):
        ## Check Cache
        cache_file = f"{cache_dir}{dstart}_{dstop}_chunk-{a}.json.gz"
        if os.path.exists(cache_file):
            continue  ## body assumed: skip author/date chunks that are already cached


def count_language_usage(filename):
    """
    Count n-gram usage within a single processed file, aggregated per date bin.
    Only the final vectorization lines come from the original snippet; the
    loader (`load_processed_file`) and the date-binning loop are assumptions.
    """
    file_data = load_processed_file(filename)  ## hypothetical loader
    counts = []
    for dt_start, dt_stop in zip(date_range[:-1], date_range[1:]):
        bin_counts = Counter()
        for fd in file_data:
            if dt_start <= fd["date"] < dt_stop:
                bin_counts.update(
                    vocab.get_ngrams(fd["text_tokenized"], NGRAMS[0], NGRAMS[1]))
        counts.append(bin_counts)
    counts_vec = vocab.dvec.transform(counts).toarray()
    return counts_vec


## Count Cache File
X_cache_file = "{}X_ngram{}-{}_{}-{}_{}.joblib".format(CACHE_DIR, NGRAMS[0],
                                                       NGRAMS[1], DATE_START,
                                                       DATE_END, CACHE_FREQ)

## Count/Load Language Usage
if not os.path.exists(X_cache_file) or RERUN_COUNT:
    ## Count Usage Over Time (In Chunks)
    LOGGER.info("Count Language Usage")
    X = np.zeros((len(date_range) - 1, len(ngrams)))
    processed_file_chunks = list(chunks(processed_files, CHUNKSIZE))
    mp = Pool(NUM_PROCESSES)
    for fchunk in tqdm(processed_file_chunks,
                       desc="File Chunk",
                       file=sys.stdout,
                       total=len(processed_file_chunks)):
        fchunk_x = list(mp.imap_unordered(count_language_usage, fchunk))
        fchunk_x = np.stack(fchunk_x).sum(axis=0)
        X += fchunk_x
    _ = mp.close()
    ## Cache
    _ = joblib.dump(X, X_cache_file)
else:
    ## Load From Cache
    LOGGER.info("Loading Cached Language Usage")
    X = joblib.load(X_cache_file)
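
## The chunks() helper used throughout these examples is never shown; below is a
## minimal sketch consistent with how it is called above (second argument named `n`).
def chunks(l, n):
    """Yield successive n-sized chunks from list l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]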