def embed_from_config(config):
    mkdir(config["embedding"]["output_data_directory"])

    # If there is a whitelist, only keep the matching filenames
    try:
        whitelist = config["score"]["input_file_whitelist"]
    except KeyError:
        whitelist = []

    #
    # Run the functions that act globally on the data
    for name in config["embedding"]["embedding_commands"]:
        obj = getattr(mb, name)

        # Load any kwargs in the config file
        kwargs = config["embedding"].copy()
        if name in kwargs:
            kwargs.update(kwargs[name])
        kwargs['target_column'] = config['target_column']

        func = obj(**kwargs)
        func.set_iterator_function(item_iterator,
                                   config["embedding"],
                                   whitelist,
                                   section="parse")
        func.compute(**kwargs)
def embed_from_config(config):
    '''
    Args:
        config (dict): Import parameters
    '''
    # Only load options from the embedding section
    target_column = config['target_column']
    econfig = config['embed']

    # Create any missing directories
    d_out = econfig['output_data_directory']
    mkdir(d_out)

    # Train each embedding model
    for name in econfig["embedding_commands"]:

        # Load any kwargs in the config file
        kwargs = econfig.copy()
        if name in kwargs:
            kwargs.update(kwargs[name])

        model = getattr(mb, name)(**kwargs)
        model.set_iterator_function(text_iterator)
        model.compute(target_column)

        f_save = os.path.join(d_out, kwargs[name]['f_db'])
        model.save(f_save)
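# A minimal sketch of the config layout that embed_from_config() above
# appears to expect, inferred only from the keys it reads. The directory
# name, the "w2v_embedding" command name, and the f_db filename are
# hypothetical placeholders; the real pipeline loads this structure with
# simple_config.load() rather than building a plain dict.
example_embed_config = {
    "target_column": "text",
    "embed": {
        "output_data_directory": "data_embeddings",   # hypothetical path
        "embedding_commands": ["w2v_embedding"],      # hypothetical command
        "w2v_embedding": {
            "f_db": "w2v.gensim",                     # hypothetical filename
        },
    },
}
# embed_from_config(example_embed_config)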
def import_data_from_config(config): """ Import parameters from the config file. import_data_from_config() and phrases_from_config() are the entry points for this step of the pipeline. Args: config: a config file """ merge_columns = config["import_data"]["merge_columns"] if not isinstance(merge_columns, list): msg = "merge_columns (if used) must be a list" raise ValueError(msg) data_out = config["import_data"]["output_data_directory"] mkdir(data_out) # Require 'input_data_directories' to be a list data_in_list = config["import_data"]["input_data_directories"] if not isinstance(data_in_list, list): msg = "input_data_directories must be a list" raise ValueError(msg) target_column = config["target_column"] for d_in in data_in_list: import_directory_csv(d_in, data_out, target_column, merge_columns)
def phrases_from_config(config): """ Identify parenthetical phrases in the documents as they are being imported to the pipeline. import_data_from_config() and phrases_from_config() are the entry points for this step of the pipeline. Args: config: a config file :return: """ _PARALLEL = config.as_bool("_PARALLEL") output_dir = config["phrase_identification"]["output_data_directory"] target_column = config["target_column"] import_config = config["import_data"] input_data_dir = import_config["output_data_directory"] F_CSV = grab_files("*.csv", input_data_dir) ABBR = collections.Counter() INPUT_ITR = db_utils.CSV_database_iterator( F_CSV, target_column, progress_bar=True ) ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column) for result in ITR: ABBR.update(result) logger.info("{} total abbrs found.".format(len(ABBR))) # Merge abbreviations that are similar logger.debug("Deduping abbr list.") df = dedupe_abbr(ABBR) logger.info("{} abbrs remain after deduping.".format(len(df))) # Output top phrase logger.info("Top 5 abbreviations") msg = "({}) {}, {}, {}" for k, (_, row) in enumerate(df[:5].iterrows()): logger.info(msg.format(k + 1, row.name, row["abbr"], row["count"])) mkdir(output_dir) f_csv = os.path.join( output_dir, config["phrase_identification"]["f_abbreviations"] ) df.to_csv(f_csv)
def import_directory_csv(d_in, d_out, output_table):

    F_CSV = []
    F_SQL = {}

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_sql = '.'.join(os.path.basename(f_csv).split('.')[:-1])
        f_sql += ".sqlite"
        f_sql = os.path.join(d_out, f_sql)

        if os.path.exists(f_sql) and not _FORCE:
            print("{} already exists, skipping".format(f_sql))
            continue

        F_CSV.append(f_csv)
        F_SQL[f_csv] = f_sql

    # Create the output directory if needed
    mkdir(d_out)

    ITR = jobmap(load_csv, F_CSV, _PARALLEL)

    # Create a reference ID for each item
    _ref_counter = itertools.count()

    for (f_csv, df) in ITR:
        f_sql = F_SQL[f_csv]
        engine = create_engine('sqlite:///' + f_sql)

        n_data_items = len(df)
        df["_ref"] = [next(_ref_counter) for _ in range(n_data_items)]
        df.set_index("_ref", inplace=True)

        df.to_sql(output_table, engine, if_exists='replace')

        print("Finished {}, {}, {}".format(f_csv, len(df), list(df.columns)))
def import_data_from_config(config):

    merge_columns = (config["import_data"]["merge_columns"]
                     if "merge_columns" in config["import_data"] else [])

    if not isinstance(merge_columns, list):
        msg = "merge_columns (if used) must be a list"
        raise ValueError(msg)

    data_out = config["import_data"]["output_data_directory"]
    mkdir(data_out)

    # Require `input_data_directories` to be a list
    data_in_list = config["import_data"]["input_data_directories"]
    if not isinstance(data_in_list, list):
        msg = "input_data_directories must be a list"
        raise ValueError(msg)

    target_column = config["target_column"]

    for d_in in data_in_list:
        import_directory_csv(d_in, data_out, target_column, merge_columns)
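# A hedged sketch of the "import_data" section consumed by
# import_data_from_config() above, based only on the keys read in the
# function. The column and directory names are hypothetical examples.
example_import_config = {
    "target_column": "text",
    "import_data": {
        "input_data_directories": ["datasets"],     # hypothetical path
        "output_data_directory": "data_import",     # hypothetical path
        "merge_columns": ["title", "abstract"],     # hypothetical columns
    },
}
# import_data_from_config(example_import_config)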
def score_from_config(global_config):

    config = global_config["score"]
    mkdir(config["output_data_directory"])

    # Run the functions that can sum over the data (e.g. TF counts)
    for name in config["count_commands"]:

        model, kwargs = _load_model(name, config)
        logger.info("Starting mapreduce {}".format(model.function_name))

        # Iterate explicitly so every row is fed to the model
        # (a bare map() would be lazily evaluated and never run)
        for row in db.text_iterator():
            model(row)

        model.save(**kwargs)

    # Load the reduced representation model
    RREP = ds.reduced_representation()

    # Run the functions that act per document (e.g. word2vec)
    for name in config["score_commands"]:

        model, kwargs = _load_model(name, config)
        f_db = os.path.join(kwargs["output_data_directory"], kwargs["f_db"])

        logger.info("Starting score model {}".format(model.method))

        for f_csv in db.get_section_filenames('parse'):

            data = {}
            for row in db.text_iterator([f_csv, ]):
                data[row["_ref"]] = model(row['text'])

            model.save(data, f_csv, f_db)

        # If required, compute the reduced representation
        if kwargs["compute_reduced_representation"]:
            nc = kwargs['reduced_representation']['n_components']
            rdata = RREP.compute(model.method, n_components=nc)
            RREP.save(model.method, rdata, f_db)
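# A rough sketch of the "score" section used by score_from_config() above.
# Only the keys the function touches are shown; the command names and
# filenames are hypothetical, and the per-command sub-section layout is an
# assumption consistent with the rest of the pipeline (the actual kwargs
# come back from _load_model()).
example_score_config = {
    "score": {
        "output_data_directory": "data_document_scores",  # hypothetical path
        "count_commands": ["term_frequency"],             # hypothetical
        "score_commands": ["score_unique_TF"],            # hypothetical
        "score_unique_TF": {
            "f_db": "document_scores.h5",                 # hypothetical
            "compute_reduced_representation": True,
            "reduced_representation": {"n_components": 25},
        },
    },
}
# score_from_config(example_score_config)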
def phrases_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABR = collections.Counter()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    df = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(df)))

    # Output the top phrases
    print("Top 5 abbreviations")
    print(df[:5])

    mkdir(output_dir)

    f_csv = os.path.join(
        output_dir,
        config["phrase_identification"]["f_abbreviations"])
    df.to_csv(f_csv)
row["_ref"] = _ref_counter.next() if F_CSV_OUT_HANDLE[f_csv] is None: F_CSV_OUT_HANDLE[f_csv] = csv.DictWriter( F_CSV_OUT[f_csv], sorted(row.keys())) F_CSV_OUT_HANDLE[f_csv].writeheader() F_CSV_OUT_HANDLE[f_csv].writerow(row) msg = "Imported {}, {} entries" print(msg.format(f_csv, k)) if __name__ == "__main__": import simple_config config = simple_config.load() _PARALLEL = config.as_bool("_PARALLEL") data_out = config["import_data"]["output_data_directory"] mkdir(data_out) output_table = config["import_data"]["output_table"] # Require `input_data_directories` to be a list data_in_list = config["import_data"]["input_data_directories"] assert (isinstance(data_in_list, list)) for d_in in data_in_list: import_directory_csv(d_in, data_out, output_table)
ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL, col=target_column)

for result in ITR:
    ABR.update(result)

msg = "\n{} total abbrs found."
print(msg.format(len(ABR)))

# Merge abbreviations that are similar
print("Deduping abbr list.")
ABR = dedupe_abbr(ABR)
print("{} abbrs remain after deduping".format(len(ABR)))

# Convert abbrs to a list
data_insert = [(phrase, abbr, count)
               for (phrase, abbr), count in ABR.most_common()]

# Convert the list to a dataframe and sort
df = pd.DataFrame(data_insert, columns=("phrase", "abbr", "count"))
df = df.sort_values(["count", "phrase"],
                    ascending=False).set_index("phrase")

# Output the top phrases
print("Top 5 abbreviations")
print(df[:5])

mkdir(output_dir)

f_csv = os.path.join(output_dir,
                     config["phrase_identification"]["f_abbreviations"])
df.to_csv(f_csv)
def parse_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")

    import_config = config["import_data"]
    parse_config = config["parse"]

    input_data_dir = import_config["output_data_directory"]
    output_dir = parse_config["output_data_directory"]

    mkdir(output_dir)

    # Build the NLPre parser pipeline from the config.
    # NOTE: parser_functions is initialized here so the snippet runs on its
    # own; in the full pipeline it may live at module scope so that the
    # dispatcher can see it.
    parser_functions = []

    for name in parse_config["pipeline"]:
        obj = getattr(nlpre, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in parse_config:
            kwargs = dict(parse_config[name])

        # Handle the special case of the precomputed acronyms
        if name == "replace_acronyms":
            f_abbr = os.path.join(
                config["phrase_identification"]["output_data_directory"],
                config["phrase_identification"]["f_abbreviations"])
            ABBR = load_phrase_database(f_abbr)
            kwargs["counter"] = ABBR

        parser_functions.append(obj(**kwargs))

    col = config["target_column"]

    F_CSV = grab_files("*.csv", input_data_dir)
    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, col, include_filename=True, progress_bar=False)

    ITR = jobmap(
        dispatcher,
        INPUT_ITR,
        _PARALLEL,
        batch_size=_global_batch_size,
        target_column=col,
    )

    F_CSV_OUT = {}
    F_WRITERS = {}

    for k, row in enumerate(ITR):
        f = row.pop("_filename")

        # Create a CSV file object for all outputs
        if f not in F_CSV_OUT:
            f_csv_out = os.path.join(output_dir, os.path.basename(f))
            F = open(f_csv_out, 'w')
            F_CSV_OUT[f] = F
            F_WRITERS[f] = csv.DictWriter(F, fieldnames=['_ref', col])
            F_WRITERS[f].writeheader()

        F_WRITERS[f].writerow(row)

    # Close the open files
    for F in F_CSV_OUT.values():
        F.close()
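# A hedged sketch of the config sections read by parse_from_config() above.
# The pipeline entries must name callables in the nlpre package; "dedash"
# and "titlecaps" are shown as plausible examples and the replace_acronyms
# entry mirrors the special case handled in the function. Paths, filenames,
# and the kwarg value are hypothetical. Note that the real pipeline passes
# a simple_config object (which provides as_bool), not a plain dict; the
# dict below only illustrates the expected key layout.
example_parse_config = {
    "target_column": "text",
    "import_data": {
        "output_data_directory": "data_import",     # hypothetical path
    },
    "parse": {
        "output_data_directory": "data_parsed",     # hypothetical path
        "pipeline": ["dedash", "titlecaps", "replace_acronyms"],
        "replace_acronyms": {"prefix": "PHRASE"},   # hypothetical kwarg
    },
    "phrase_identification": {
        "output_data_directory": "data_import",     # hypothetical path
        "f_abbreviations": "abbreviations.csv",     # hypothetical filename
    },
}
# parse_from_config(example_parse_config)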
def predict_from_config(config):

    ERROR_MATRIX = {}
    PREDICTIONS = {}

    use_meta = config["predict"]['use_meta']

    # For now, we can only deal with one column using meta!
    assert len(config["predict"]["categorical_columns"]) == 1

    methods = uds.get_score_methods()

    pred_col = config["target_column"]
    pred_output_dir = config["predict"]["output_data_directory"]
    extra_cols = config["predict"]["extra_columns"]
    mkdir(pred_output_dir)

    # Load the categorical columns
    df = uds.load_ORG_data(config["predict"]["categorical_columns"])

    ITR = itertools.product(methods, config["predict"]["categorical_columns"])
    X_META = []

    cfg = config["predict"]
    cfg["_PARALLEL"] = config["_PARALLEL"]

    df_scores = None

    for (method, cat_col) in ITR:

        text = "Predicting [{}] [{}:{}]"
        logger.info(text.format(method, cat_col, pred_col))

        DV = uds.load_document_vectors(method)
        X = DV["docv"]

        if use_meta:
            X_META.append(X)

        Y = np.hstack(df[cat_col].values)

        counts = np.array(list(collections.Counter(Y).values()), dtype=float)
        counts /= counts.sum()

        msg = " Class balance for categorical prediction: {}"
        logger.info(msg.format(counts))

        # Determine the baseline prediction
        y_counts = collections.Counter(Y).values()
        baseline_score = max(y_counts) / float(sum(y_counts))

        # Predict
        scores, F1, errors, pred, dfs = categorical_predict(
            X=X,
            y_org=Y,
            method_name=method,
            use_SMOTE=int(cfg['use_SMOTE']),
            use_PARALLEL=int(cfg['_PARALLEL']),
            n_estimators=int(cfg['n_estimators']),
        )

        text = " F1 {:0.3f}; Accuracy {:0.3f}; baseline ({:0.3f})"
        logger.info(text.format(scores.mean(), F1.mean(), baseline_score))

        PREDICTIONS[method] = pred
        ERROR_MATRIX[method] = errors

        if df_scores is None:
            df_scores = dfs
        else:
            df_scores[method] = dfs[method]

    if use_meta:
        # Build meta predictor
        # META_X = np.hstack([PREDICTIONS[method] for method
        #                     in config["predict"]["meta_methods"]])
        X_META = np.hstack(X_META)

        method = "meta"
        text = "Predicting [{}] [{}:{}]"
        logger.info(text.format(method, cat_col, pred_col))

        scores, F1, errors, pred, dfs = categorical_predict(
            X=X_META,
            y_org=Y,
            method_name=method,
            n_estimators=int(cfg['n_estimators']),
            use_PARALLEL=int(cfg['_PARALLEL']),
        )

        text = " F1 {:0.3f}; Accuracy {:0.3f}; baseline ({:0.3f})"
        logger.info(text.format(scores.mean(), F1.mean(), baseline_score))

        PREDICTIONS[method] = pred
        ERROR_MATRIX[method] = errors

        df_scores[method] = dfs[method]

    # Save the predictions
    if extra_cols:
        df_ORG = uds.load_ORG_data(extra_columns=extra_cols)
        for col in extra_cols:
            df_scores[col] = df_ORG[col]

    f_save = os.path.join(pred_output_dir,
                          "{}_prediction.csv".format(cat_col))
    df_scores.index.name = '_ref'
    df_scores.to_csv(f_save)

    names = methods
    if use_meta:
        names += ["meta", ]

    # Plotting methods here
    df = pd.DataFrame(0, index=names, columns=names)

    max_offdiagonal = 0
    for na, nb in itertools.product(names, repeat=2):
        if na != nb:
            idx = (ERROR_MATRIX[na] == 0) * (ERROR_MATRIX[nb] == 1)
            max_offdiagonal = max(max_offdiagonal, idx.sum())
        else:
            idx = ERROR_MATRIX[na] == 0

        df[na][nb] = idx.sum()

    print(df)  # Output result to stdout

    sns.heatmap(df, annot=True, vmin=0, vmax=1.2 * max_offdiagonal, fmt="d")
    plt.yticks(rotation=0)
    plt.xticks(rotation=45)
    plt.show()
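# A hedged example of the sections read by predict_from_config() above,
# listing only keys the function touches. The column names, output path,
# and numeric settings are hypothetical placeholders; categorical_columns
# must contain exactly one entry, as asserted in the function.
example_predict_config = {
    "target_column": "text",
    "_PARALLEL": True,
    "predict": {
        "categorical_columns": ["journal"],          # exactly one column
        "output_data_directory": "data_predict",     # hypothetical path
        "extra_columns": ["title"],                  # hypothetical columns
        "use_meta": True,
        "use_SMOTE": 0,
        "n_estimators": 200,
    },
}
# predict_from_config(example_predict_config)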
            yield val
            progress_bar.update()

        if not self.yield_single:
            yield data


if __name__ == "__main__":

    import simple_config
    config = simple_config.load("score")

    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    n_jobs = -1 if _PARALLEL else 1

    mkdir(config["output_data_directory"])

    ###########################################################
    # Fill the pipeline with function objects

    mapreduce_functions = []
    for name in config["mapreduce_commands"]:

        obj = getattr(ds, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in config:
            kwargs = config[name]

        # Add in the embedding configuration options
ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL)

for result in ITR:
    ABR.update(result)

msg = "Completed {} {}. {} total abbrs found."
print(msg.format(f_sql, column_name, len(ABR)))

# Merge abbreviations that are similar
print("Deduping list")
ABR = dedupe_abbr(ABR)
print("{} abbrs remain after deduping".format(len(ABR)))

# Convert abbrs to a list
data_insert = [(phrase, abbr, count)
               for (phrase, abbr), count in ABR.most_common()]

# Convert the list to a dataframe for insert
df = pd.DataFrame(data_insert, columns=("phrase", "abbr", "count"))

mkdir(output_dir)
f_sql = os.path.join(output_dir, config["f_abbreviations"])
engine = create_engine('sqlite:///' + f_sql)

# Save the abbrs to a table
df.to_sql(config["output_table"], engine, if_exists='replace')
def score_from_config(global_config):

    config = global_config["score"]
    mkdir(config["output_data_directory"])

    #
    # Fill the pipeline with function objects
    mapreduce_functions = []
    for name in config["mapreduce_commands"]:

        obj = getattr(ds, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in config:
            kwargs = config[name]

        # Add in the embedding configuration options
        kwargs["embedding"] = global_config["embedding"]
        kwargs["score"] = global_config["score"]

        val = name, obj(**kwargs)
        mapreduce_functions.append(val)

    col = global_config['target_column']

    # Run the functions that can act like mapreduce (e.g. TF counts)
    for name, func in mapreduce_functions:
        print("Starting mapreduce {}".format(func.table_name))

        INPUT_ITR = db.item_iterator(
            config,
            text_column=col,
            progress_bar=True,
            include_filename=True,
        )

        # Map then reduce over every item; an explicit loop is used so the
        # iterator is actually consumed (a lazy map/imap would never run)
        for item in INPUT_ITR:
            func.reduce(func(item))

        func.save(config)

    # Run the functions that act globally on the data
    for name in config["globaldata_commands"]:
        obj = getattr(ds, name)

        # Load any kwargs in the config file
        kwargs = config
        if name in config:
            kwargs.update(config[name])

        # Add in the embedding configuration options
        func = obj(**kwargs)

        F_CSV = db.get_section_filenames("parse")
        for f_csv in F_CSV:
            ITR = db.single_file_item_iterator(f_csv)
            func.compute_single(ITR)

        func.save_single()
        func.compute_reduced_representation()
import itertools

from utils.os_utils import mkdir
import document_scoring as ds
import simple_config
from utils.db_utils import item_iterator

if __name__ == "__main__":

    global_config = simple_config.load()
    _PARALLEL = global_config.as_bool("_PARALLEL")
    config = global_config["score"]

    n_jobs = -1 if _PARALLEL else 1

    mkdir(config["output_data_directory"])

    #
    # Fill the pipeline with function objects
    mapreduce_functions = []
    for name in config["mapreduce_commands"]:
        obj = getattr(ds, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in config:
            kwargs = config[name]

        # Add in the embedding configuration options
        kwargs["embedding"] = global_config["embedding"]
def explain_metaclusters(config):

    save_dest = config["postprocessing"]["output_data_directory"]
    uos.mkdir(save_dest)

    args = config["postprocessing"]["LIME_explainer"]

    f_csv_out = os.path.join(save_dest, "cluster_LIME.csv")

    data = uds.load_metacluster_data()
    centroids = data["meta_centroids"]
    labels = data["meta_labels"]

    # Find out which centroids are close, and find their index locations
    C = cdist(centroids, centroids, metric="cosine")
    cidx = np.where(C > float(args["metacluster_cosine_minsim"]))

    n_lime_samples = int(args["n_lime_samples"])
    n_lime_features = int(args["n_lime_features"])
    n_estimators = int(args["n_estimators"])

    INPUT_ITR = udb.text_iterator()
    ALL_TEXT = np.array([row["text"] for row in INPUT_ITR])

    data = []
    for i, j in zip(*cidx):

        # Only take the upper diagonal
        if i >= j:
            continue

        logger.info("Computing LIME for clusters {} and {}".format(i, j))

        labels_i = labels == i
        labels_j = labels == j
        idx = labels_i | labels_j

        LE = sklearn.preprocessing.LabelEncoder()
        Y = LE.fit_transform(labels[idx])

        n_samples = min(labels_i.sum(), labels_j.sum(), n_lime_samples)

        new_idx = _select_even_subset(Y, n_samples)
        Y = Y[new_idx]
        TEXT = ALL_TEXT[idx][new_idx]

        df = _compute_LIME(TEXT, Y, n_estimators, n_lime_features)

        # Remove words that contribute < 0.5%
        df.score /= np.abs(df.score).sum()
        df = df[np.abs(df.score) > 0.005]

        # Normalize the scores and make them human friendly
        df.score /= np.abs(df.score).sum()
        df.score *= 100

        class_names = LE.classes_
        df["negative_class"] = class_names[0]
        df["positive_class"] = class_names[1]

        data.append(df)

    df = pd.concat(data).set_index(["negative_class", "positive_class"])
    df.to_csv(f_csv_out)
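# A sketch of the "postprocessing" options consumed by explain_metaclusters()
# above, inferred from the keys it reads; the directory name and all numeric
# values are hypothetical.
example_postprocessing_config = {
    "postprocessing": {
        "output_data_directory": "results",           # hypothetical path
        "LIME_explainer": {
            "metacluster_cosine_minsim": 0.6,         # hypothetical value
            "n_lime_samples": 400,                    # hypothetical value
            "n_lime_features": 50,                    # hypothetical value
            "n_estimators": 50,                       # hypothetical value
        },
    },
}
# explain_metaclusters(example_postprocessing_config)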
from utils.os_utils import mkdir
import model_building as mb
import simple_config
from utils.db_utils import item_iterator

if __name__ == "__main__":

    config = simple_config.load()
    mkdir(config["embedding"]["output_data_directory"])

    # If there is a whitelist, only keep the matching filenames
    try:
        whitelist = config["score"]["input_file_whitelist"]
    except KeyError:
        whitelist = []

    #
    # Run the functions that act globally on the data
    for name in config["embedding"]["embedding_commands"]:
        obj = getattr(mb, name)

        # Load any kwargs in the config file
        kwargs = config["embedding"].copy()
        if name in kwargs:
            kwargs.update(kwargs[name])
        kwargs['target_column'] = config['target_column']

        func = obj(**kwargs)
        func.set_iterator_function(item_iterator,