def main(): """ """ ## Parse Command Line Arguments args = parse_command_line() ## Load Settings settings = load_settings() ## Update Max Documents Parameter Based On Command Line MODEL._vocabulary._max_docs = args.max_docs ## Create Coordinate Grid if not args.known_coordinates: coordinates = MODEL._create_coordinate_grid(args.grid_cell_size) else: coordinates = load_known_coordinates(settings) ## Load User List user_data_paths = load_users() ## Format Data Paths into Chunks if args.chunksize: user_data_chunks = list(chunks(user_data_paths, args.chunksize)) else: user_data_chunks = [user_data_paths] ## Process Data in Chunks all_preds = [] for d, data_chunk in enumerate(user_data_chunks): ## Update User LOGGER.info("[Processing User Data Chunk {}/{}]".format( d + 1, len(user_data_chunks))) ## Prepare User Data data_chunk, X, n = prepare_data(data_chunk) ## Make Predictions LOGGER.info("Making Inferences") _, P = MODEL.predict_proba(X, coordinates) y_pred = pd.DataFrame(index=data_chunk, data=coordinates[P.argmax(axis=1)], columns=["longitude_argmax", "latitude_argmax"]) ## Reverse Geocoding if args.reverse_geocode: LOGGER.info("Reversing the Geolocation Inferences") reverse = reverse_search( y_pred[["longitude_argmax", "latitude_argmax"]].values) for level, level_name in zip( ["name", "admin2", "admin1", "cc"], ["city", "county", "state", "country"]): level_data = [i[level] for i in reverse] y_pred[f"{level_name}_argmax"] = level_data ## Add Posterior if args.posterior: P = pd.DataFrame(P, index=data_chunk, columns=list(map(tuple, coordinates))) y_pred = pd.merge(y_pred, P, left_index=True, right_index=True) all_preds.append(y_pred) ## Cache all_preds = pd.concat(all_preds) LOGGER.info("Caching Inferences") all_preds.to_csv(args.output_csv, index=True) ## Done LOGGER.info("Script complete.")
                                       (pre_covid_num_matches_gender[col] == 0)] = np.nan

#######################
### Representative Posts
#######################

LOGGER.info("Identifying Representative Examples")

## Post Cache
rep_cache_file = f"{CACHE_DIR}representative_examples.json"

## Load Representatives
if not os.path.exists(rep_cache_file) or RERUN:
    ## Find Representative Posts
    representative_examples = []
    keyword_chunks = list(chunks(keyword_counts.columns.tolist(), 40))
    for keyword_chunk in tqdm(keyword_chunks,
                              position=0,
                              desc="Keyword Chunk",
                              file=sys.stdout):
        keyword_examples = find_keyword_examples(filenames,
                                                 keyword_chunk,
                                                 n=100,
                                                 indorgs=INDORGS,
                                                 genders=GENDERS,
                                                 locations=LOCATIONS)
        representative_examples.append(keyword_examples)
    representative_examples = pd.concat(representative_examples).reset_index(drop=True)
    ## Cache
    with open(rep_cache_file, "w") as the_file:
def predict_and_interpret(filenames,
                          model,
                          min_date=None,
                          max_date=None,
                          n_samples=None,
                          randomized=False,
                          interpret=False,
                          bootstrap_samples=100,
                          bootstrap_sample_percent=30,
                          ignore_missing=True,
                          chunksize=None):
    """
    Vectorize user files in chunks, compute model probabilities from the linear
    logits, and (optionally) estimate bootstrap confidence intervals for the
    per-feature contributions (support).
    """
    ## Date Boundaries
    if min_date is not None and isinstance(min_date, str):
        min_date = pd.to_datetime(min_date)
    if max_date is not None and isinstance(max_date, str):
        max_date = pd.to_datetime(max_date)
    ## Get Chunks
    if chunksize is None:
        chunksize = len(filenames)
    filechunks = list(chunks(filenames, chunksize))
    ## Initialize Cache
    X_test = []
    support = []
    y_pred = {}
    n = {}
    tn = {}
    tn_binary = {}
    filtered_filenames = []
    ## Cycle Through Chunks
    for j, file_chunk in enumerate(filechunks):
        LOGGER.info("[Beginning to Process File Chunk {}/{}]".format(
            j + 1, len(filechunks)))
        ## Vectorize the Data
        LOGGER.info("Vectorizing Test Files")
        chunk_files, X_chunk, _, n_ = model._load_vectors(file_chunk,
                                                          None,
                                                          min_date=min_date,
                                                          max_date=max_date,
                                                          n_samples=n_samples,
                                                          randomized=randomized,
                                                          return_post_counts=True)
        ## Ignore Users Without Any Features
        if ignore_missing:
            LOGGER.info("Filtering Out Users Without Any Recognized Terms")
            missing_mask = np.nonzero(X_chunk.sum(axis=1) > 0)[0]
            chunk_files = [chunk_files[m] for m in missing_mask]
            X_chunk = X_chunk[missing_mask]
            n_ = n_[missing_mask]
        n_ = dict(zip(chunk_files, n_))
        ## Count Tokens
        tn_ = dict((filename, count) for filename, count in
                   zip(chunk_files, X_chunk.sum(axis=1)))
        tn_binary_ = dict((filename, count) for filename, count in
                          zip(chunk_files, (X_chunk > 0).sum(axis=1)))
        ## Apply Any Additional Preprocessing
        LOGGER.info("Generating Feature Set")
        X_chunk = model.preprocessor.transform(X_chunk)
        ## Feed Forward
        LOGGER.info("Computing Logits")
        support_ = np.multiply(X_chunk, model.model.coef_)
        logits = support_.sum(axis=1) + model.model.intercept_
        ## Get Predictions
        LOGGER.info("Computing Probabilities")
        p = dict(zip(chunk_files, 1 / (1 + np.exp(-logits))))
        ## Cache Results
        if interpret:
            X_test.append(X_chunk)
            support.append(support_)
        y_pred.update(p)
        n.update(n_)
        tn.update(tn_)
        tn_binary.update(tn_binary_)
        filtered_filenames.extend(chunk_files)
    ## Format Cache
    n = np.array([n[filename] for filename in filtered_filenames])
    tn = np.array([tn[filename] for filename in filtered_filenames])
    tn_binary = np.array([tn_binary[filename] for filename in filtered_filenames])
    ## Interpretation
    feature_range = None
    if interpret:
        ## Concatenate Features and Support
        if isinstance(X_test[0], csr_matrix):
            X_test = vstack(X_test).toarray()
        else:
            X_test = np.vstack(X_test)
        if isinstance(support[0], csr_matrix):
            support = vstack(support)
        else:
            support = np.vstack(support)
        ## Get Features
        feature_names = model.get_feature_names()
        ## Get Feature Range (Bootstrap Used for Confidence Intervals)
        sample_size = int(X_test.shape[0] * bootstrap_sample_percent / 100)
        feature_range = []
        for _ in tqdm(list(range(bootstrap_samples)),
                      desc="Bootstrap Feature Samples",
                      file=sys.stdout):
            sind = np.random.choice(X_test.shape[0],
                                    size=sample_size,
                                    replace=True)
            feature_range.append(support[sind].mean(axis=0))
        feature_range = np.percentile(np.vstack(feature_range),
                                      [2.5, 50, 97.5],
                                      axis=0)
        feature_range = pd.DataFrame(feature_range.T,
                                     index=feature_names,
                                     columns=["lower", "median", "upper"])
    return y_pred, feature_range, n, tn, tn_binary
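## Illustrative sketch (not part of predict_and_interpret): a self-contained toy
## showing the decomposition used above, where multiplying features by the model
## coefficients gives per-feature "support" whose row sum plus the intercept is
## the logit, and the sigmoid of that logit matches the model's predicted
## probability. Assumes a scikit-learn LogisticRegression; the random data below
## is invented for illustration.
def _example_support_decomposition():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    rng = np.random.default_rng(42)
    X = rng.normal(size=(100, 5))
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    clf = LogisticRegression().fit(X, y)
    ## Per-feature contributions ("support") to each sample's logit
    support = np.multiply(X, clf.coef_)
    logits = support.sum(axis=1) + clf.intercept_
    probs = 1 / (1 + np.exp(-logits))
    ## Matches sklearn's own probability of the positive class
    assert np.allclose(probs, clf.predict_proba(X)[:, 1])
    return support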
def learn_vocabulary(filenames,
                     chunksize=100,
                     prune_rate=1,
                     min_prune_freq=1,
                     min_date=None,
                     max_date=None,
                     min_n=1,
                     max_n=1,
                     remove_retweets=True,
                     binarize=False,
                     jobs=4,
                     pretokenized=False):
    """
    Args:
        filenames (list of str): Raw data filenames
        chunksize (int): How many files to process in parallel before aggregating
        prune_rate (int): How many chunks to process before pruning vocabulary
        min_prune_freq (int): N-grams below this threshold at each pruning are removed
        min_date (str or None): Earliest post date to include
        max_date (str or None): Latest post date to include
        min_n (int): Minimum n-gram length
        max_n (int): Maximum n-gram length
        remove_retweets (bool): Whether to drop retweets before counting
        binarize (bool): If True, count each n-gram at most once per file
        jobs (int): Number of worker processes
        pretokenized (bool): Whether the input files are already tokenized
    """
    ## Storage
    vocab = Counter()
    agg_chunk_counts = Counter()
    ## Initialize Multiprocessor
    mp = Pool(jobs)
    ## Initialize Tokenize/Count Function
    counter = partial(tokenize_and_count,
                      min_date=min_date,
                      max_date=max_date,
                      min_n=min_n,
                      max_n=max_n,
                      remove_retweets=remove_retweets,
                      pretokenized=pretokenized)
    ## Process Data
    filechunks = list(chunks(filenames, chunksize))
    chunks_processed = 0
    for chunk in tqdm(filechunks,
                      desc="Learning Vocabulary",
                      file=sys.stdout,
                      position=0):
        ## Increment Chunk Count
        chunks_processed += 1
        ## Apply Counter
        chunk_counts = list(tqdm(mp.imap_unordered(counter, chunk),
                                 total=len(chunk),
                                 desc="File",
                                 file=sys.stdout,
                                 position=1,
                                 leave=False))
        ## Aggregate Counts
        for cc in chunk_counts:
            if binarize:
                cc = Counter({i: 1 for i in cc.keys()})
            agg_chunk_counts += cc
        ## Update Cache
        if chunks_processed == prune_rate:
            ## Apply Pruning
            agg_chunk_counts = Counter({x: y for x, y in agg_chunk_counts.items()
                                        if y >= min_prune_freq})
            ## Add to General Vocab
            vocab += agg_chunk_counts
            ## Reset Chunk Counter
            agg_chunk_counts = Counter()
            chunks_processed = 0
    ## Apply Final Update
    if len(agg_chunk_counts) > 0:
        ## Prune Filter
        agg_chunk_counts = Counter({x: y for x, y in agg_chunk_counts.items()
                                    if y >= min_prune_freq})
        ## Add to General Vocab
        vocab += agg_chunk_counts
    ## Close Pool
    _ = mp.close()
    ## Initialize Vectorizer Using Learned Vocabulary
    global cvec
    cvec = initialize_vectorizer(vocab)
    return vocab
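## Illustrative sketch (not part of learn_vocabulary): a self-contained toy of the
## prune-as-you-go pattern above, where counts are aggregated over chunks and rare
## items are dropped at each pruning step to bound memory. The token lists and
## thresholds below are invented for illustration.
def _example_pruned_counting():
    from collections import Counter
    chunks_of_tokens = [["the", "cat", "sat"], ["the", "dog"], ["the", "cat"]]
    min_prune_freq = 2
    vocab = Counter()
    pending = Counter()
    for i, tokens in enumerate(chunks_of_tokens, start=1):
        pending += Counter(tokens)
        if i % 2 == 0:  # prune every 2 chunks (analogous to prune_rate)
            pending = Counter({tok: c for tok, c in pending.items()
                               if c >= min_prune_freq})
            vocab += pending
            pending = Counter()
    ## Final flush of whatever remains, with the same frequency filter
    vocab += Counter({tok: c for tok, c in pending.items()
                      if c >= min_prune_freq})
    return vocab  # Counter({'the': 2}); note pruning is per window, not global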
    date_range = [start_date, end_date]
else:
    date_range = [pd.to_datetime(start_date)]
    while date_range[-1] < pd.to_datetime(end_date):
        date_range.append(min(date_range[-1] + timedelta(freq),
                              pd.to_datetime(end_date)))
    date_range = [i.date().isoformat() for i in date_range]

## Cache Directory/Plot Directory
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)

## Get Comment Counts over Date Range
author_chunks = list(chunks(sorted(counts.keys()), n=100))
for dstart, dstop in tqdm(zip(date_range[:-1], date_range[1:]),
                          total=len(date_range) - 1,
                          position=0,
                          leave=False,
                          file=sys.stdout,
                          desc="Date Range"):
    for a, author_chunk in tqdm(enumerate(author_chunks),
                                total=len(author_chunks),
                                position=1,
                                leave=False,
                                file=sys.stdout,
                                desc="Author Chunk"):
        ## Check Cache
        cache_file = f"{cache_dir}{dstart}_{dstop}_chunk-{a}.json.gz"
        if os.path.exists(cache_file):
vocab.get_ngrams(fd["text_tokenized"], NGRAMS[0], NGRAMS[1])) counts_vec = vocab.dvec.transform(counts).toarray() return counts_vec ## Count Cache File X_cache_file = "{}X_ngram{}-{}_{}-{}_{}.joblib".format(CACHE_DIR, NGRAMS[0], NGRAMS[1], DATE_START, DATE_END, CACHE_FREQ) ## Count/Load Language Usage if not os.path.exists(X_cache_file) or RERUN_COUNT: ## Count Usage Over Time (In Chunks) LOGGER.info("Count Language Usage") X = np.zeros((len(date_range) - 1, len(ngrams))) processed_file_chunks = list(chunks(processed_files, CHUNKSIZE)) mp = Pool(NUM_PROCESSES) for fchunk in tqdm(processed_file_chunks, desc="File Chunk", file=sys.stdout, total=len(processed_file_chunks)): fchunk_x = list(mp.imap_unordered(count_language_usage, fchunk)) fchunk_x = np.stack(fchunk_x).sum(axis=0) X += fchunk_x _ = mp.close() ## Cache _ = joblib.dump(X, X_cache_file) else: ## Load From Cache LOGGER.info("Loading Cached Language Usage") X = joblib.load(X_cache_file)