def _compareAmbiguousPairs(self, pairs_to_use):
    printLogToConsole(self.console_log_level, "Comparing all ambiguous pairs", logging.INFO)
    self.logger.info("Comparing all ambiguous pairs")
    try:
        comparator = CompareAuthors(**self.compare_args)
    except Exception as e:
        self.logger.error("Error initializing comparator")
        self.logger.error("comparator_args={}".format(list(self.compare_args.keys())))
        self.logger.exception(e)
        raise e
    out = {}
    if self.cores == 1:
        self.logger.debug("Using 1 core")
        pbar = tqdm(total=len(pairs_to_use), file=sys.stdout)
        for k, pairs in pairs_to_use.items():
            target, compare_results = self._compareAuthors([comparator, k, pairs])
            out[target] = compare_results
            pbar.update()
        pbar.close()
        return out
    else:
        self.logger.debug("Using {} cores".format(self.cores))
        args = [[comparator, k, pairs] for k, pairs in pairs_to_use.items()]
        with mp.Pool(self.cores) as pool:
            imap_results = list(
                tqdm(pool.imap_unordered(self._compareAuthors, args), total=len(args), file=sys.stdout))
        for k, res in imap_results:
            out[k] = res
        return out
def handleUserInput(self):
    printLogToConsole(self.console_log_level, "Found {} targets".format(len(self.targets)), logging.INFO,
                      logger=self.logger)
    print("INFO: Enter target-ids:")
    while 1:
        user_input = input(">> ")
        if user_input == "\n" or len(user_input) == 0:
            self.logger.debug("Got empty user input")
            continue
        command_split = user_input.split(" ")
        command = command_split[0]
        if command not in self.valid_main_commands:
            printLogToConsole(self.console_log_level, "{} is not a valid command".format(command),
                              logging.INFO, logger=self.logger)
        elif command == "e":
            if self.confirmAction():
                return
        else:
            action = self.valid_main_commands[command]["action"]
            if self.valid_main_commands[command]["required"]:
                if len(command_split) != 2:
                    print("Invalid arguments passed to {}".format(command))
                else:
                    action(command_split[1])
            else:
                action()
            self.logger.debug("Finished running command {}".format(command))
def _makeAmbiguousPairs(self, ambiguous_papers, check_authors, authors_to_get):
    printLogToConsole(self.console_log_level, "Creating pairs for ambiguous authors", logging.INFO)
    self.logger.info("Creating pairs for ambiguous authors")
    known_author_info, error_authors, error_papers = self._getAuthorInfos(authors_to_get)
    if error_authors > 0:
        self.logger.warning("{} errors getting known author infos".format(error_authors))
    if error_papers > 0:
        self.logger.warning("{} errors getting known author papers".format(error_papers))
    self.logger.debug("{} known papers".format(len(known_author_info)))
    self.logger.debug("{} ambiguous author ids".format(len(check_authors)))
    results = defaultdict(list)
    excluded = defaultdict(list)
    for a in ambiguous_papers.keys():
        printLogToConsole(self.console_log_level, "Creating pairs for {}".format(a), logging.INFO)
        self.logger.info("Creating pairs for {}".format(a))
        self.logger.debug("{} has {} papers".format(a, len(ambiguous_papers[a])))
        self.logger.debug("{} has {} to check against".format(a, len(check_authors[a])))
        self.logger.debug("{} has {} total possible pairs".format(
            a, len(ambiguous_papers[a]) * len(check_authors[a])))
        known_to_use = [[" ".join(x), known_author_info[" ".join(x)]] for x in check_authors[a]]
        for p in ambiguous_papers[a]:
            ambiguous_paper_info = getAuthorInfo([self.papers[p], a])
            pairs_to_use, pairs_excluded = self._makePairs(ambiguous_paper_info, known_to_use)
            self.logger.debug("{} {} has {} pairs".format(p, a, len(pairs_to_use)))
            self.logger.debug("{} {} has {} excluded".format(p, a, len(pairs_excluded)))
            results[" ".join([p, a])] = pairs_to_use
            excluded[" ".join([p, a])] = [x[0] for x in pairs_excluded]
    return results, excluded
def save(self):
    path = os.getcwd() + self.model_save_path + self.model_name
    if not os.path.exists(path):
        os.mkdir(path)  # I got permission denied when using os.path.join
    with open(path + "/model.pickle", "wb") as f:
        pickle.dump(self.model, f)
    parameters_dict = {
        "classifiers": self.classifiers,
        "classifier_weights": self.classifier_weights,
        "classifier_params": self.classifier_params,
        "special_only": self.special_only,
        "test_fraction": self.test_fraction,
        "rand_seed": self.rand_seed,
        "diff_same_ratio": self.dif_same_ratio,
        "cutoff": self.cutoff
    }
    with open(path + "/parameters.json", "w") as f:
        json.dump(parameters_dict, f, indent=4)
    printLogToConsole(self.console_log_level, "Saved model {} to {}".format(self.model_name, path),
                      logging.INFO)
    self.logger.info("Saved model {} to {}".format(self.model_name, path))
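# A minimal standalone sketch of reading back what save() writes above, assuming
# the same <cwd><model_save_path><model_name> directory layout. loadModel is a
# hypothetical helper for illustration, not part of this class.
import json
import os
import pickle


def loadModel(model_save_path, model_name):
    path = os.getcwd() + model_save_path + model_name
    # model.pickle holds the fitted VotingClassifier, parameters.json the settings
    with open(path + "/model.pickle", "rb") as f:
        model = pickle.load(f)
    with open(path + "/parameters.json") as f:
        parameters = json.load(f)
    return model, parameters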
def _getAuthorInfos(self, authors) -> (dict, int, int):
    out = {}
    printLogToConsole(self.console_log_level, "Getting author info for specified authors", logging.INFO)
    self.logger.info("Getting author info for specified authors")
    self.logger.debug("authors={}".format(authors))
    error_authors = 0
    error_papers = 0
    pbar = tqdm(total=len(authors), file=sys.stdout)
    for a in authors:
        if a not in self.author_papers:
            pbar.update()
            self.logger.warning("{} is not in self.author_papers".format(a))
            error_authors += 1
            continue
        for p in self.author_papers[a]:
            if p not in self.papers:
                self.logger.debug("{} not in self.papers".format(p))
                error_papers += 1
                continue
            auth_key, auth_info = getAuthorInfo([self.papers[p], a])
            out[auth_key] = auth_info
        pbar.update()
    pbar.close()
    self.logger.debug("len(out)={}".format(len(out)))
    self.logger.debug("error_authors={}".format(error_authors))
    self.logger.debug("error_papers={}".format(error_papers))
    return out, error_authors, error_papers
def _getAuthorInfo(self, a):
    self.logger.debug("Getting info for {}".format(a))
    # strip parenthetical notes from the raw name, then collapse double spaces
    name = cleanName(
        remove_weird_notes.sub(" ", nameFromDict(self.id_to_name[a])).replace("  ", " ")).replace("  ", " ")
    printLogToConsole(self.console_log_level, "id={}".format(a), logging.INFO, logger=self.logger)
    printLogToConsole(self.console_log_level, "name={}".format(name), logging.INFO, logger=self.logger)
    printLogToConsole(self.console_log_level, "Papers for {}:".format(a), logging.INFO, logger=self.logger)
    for p in self.author_papers[a]:
        if p not in self.papers:
            continue
        try:
            title = self.papers[p].title
        except AttributeError:
            # papers may be raw dicts instead of Paper objects
            title = self.papers[p]["title"]
        printLogToConsole(self.console_log_level, "\t{}\t{}".format(p, title), logging.INFO,
                          logger=self.logger)
    printLogToConsole(self.console_log_level,
                      "{} Author(s) have this name".format(len(self.names[name])), logging.INFO,
                      logger=self.logger)
def _displayOverride(self):
    printLogToConsole(self.console_log_level, "Override Authors: ", logging.INFO, logger=self.logger)
    for k in self.override_authors.keys():
        printLogToConsole(self.console_log_level,
                          "{} has {} authors to compare with".format(k, len(self.override_authors[k])),
                          logging.INFO, logger=self.logger)
def _clearAuthorOverride(self, a):
    self.logger.debug("Received clear override command for {}".format(a))
    if a not in self.override_authors:
        printLogToConsole(self.console_log_level,
                          "{} does not have authors to compare with".format(a), logging.INFO,
                          logger=self.logger)
        return
    if self.confirmAction():
        del self.override_authors[a]
        self.logger.debug("Removed {} from override_authors".format(a))
def createModel(self, classifier_parameters):
    self.classifier_params = classifier_parameters
    printLogToConsole(self.console_log_level, "Creating model", logging.INFO)
    self.logger.info("Creating model")
    self.logger.debug("{} estimators".format(len(self.classifiers)))
    # dispatch table replaces the long if/elif chain; same constructors, same errors
    supported_classifiers = {
        "GaussianNB": GaussianNB,
        "KNeighborsClassifier": KNeighborsClassifier,
        "MLPClassifier": MLPClassifier,
        "SVC": SVC,
        "RBF": RBF,
        "RandomForestClassifier": RandomForestClassifier,
        "AdaBoostClassifier": AdaBoostClassifier,
        "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis,
        "DecisionTreeClassifier": DecisionTreeClassifier,
        "GaussianProcessClassifier": GaussianProcessClassifier,
    }
    weights = []
    for n, m in self.classifiers:
        self.logger.debug("n={}".format(n))
        self.logger.debug("m={}".format(m))
        weights.append(self.classifier_weights[n])
        if n not in classifier_parameters:
            self.logger.error("{} is not in classifier_parameters".format(n))
            raise KeyError("{} is not in classifier_parameters".format(n))
        if m not in supported_classifiers:
            self.logger.error("Unknown classifier")
            raise ValueError("{} is not a supported classifier".format(m))
        self.estimators.append((n, supported_classifiers[m](**classifier_parameters[n])))
    self.model = VotingClassifier(self.estimators, voting=self.voting, weights=weights)
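# A minimal sketch of the classifier_parameters mapping createModel() expects:
# one dict of constructor kwargs per estimator, keyed by the estimator name from
# self.classifiers. The names and values below are illustrative, not the
# project's defaults.
#
# example_parameters = {
#     "knn": {"n_neighbors": 5},
#     "forest": {"n_estimators": 100, "random_state": 0},
# }
#
# With self.classifiers = [("knn", "KNeighborsClassifier"),
# ("forest", "RandomForestClassifier")], createModel(example_parameters) builds
# both estimators and wraps them in a VotingClassifier weighted by
# self.classifier_weights.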
def trainModel(self, voting="hard"):
    printLogToConsole(self.console_log_level, "Training model", logging.INFO)
    self.logger.info("Training model")
    if self.special_only:
        self.logger.debug("Training on special data")
        self.logger.debug("Training cases: {}".format(len(self.special_train["X"])))
        train = self.special_train
    else:
        self.logger.debug("Training on all data")
        self.logger.debug("Training cases: {}".format(len(self.train["X"])))
        train = self.train
    X = train["X"]
    Y = train["Y"]
    if self.train_all_estimators:
        for n, m in self.estimators:
            t0 = time.time()
            m.fit(X, Y)
            t1 = time.time()
            progress_str = "Finished fitting classifier {} in {:.2f}s".format(n, t1 - t0)
            printLogToConsole(self.console_log_level, progress_str, logging.INFO)
            self.logger.info(progress_str)
    printLogToConsole(self.console_log_level, "Fitting the VotingClassifier Model", logging.INFO)
    self.logger.info("Fitting the VotingClassifier Model")
    self.model.fit(X, Y)
    printLogToConsole(self.console_log_level, "Finished fitting model", logging.INFO)
    self.logger.info("Finished fitting model")
def fillData(self):
    printLogToConsole(self.console_log_level, "Adding rest of data to new author papers", logging.INFO)
    self.logger.info("Adding rest of data to new author papers")
    skipped_papers = 0
    # auth_pbar = tqdm(total=len(self.author_papers), file=sys.stdout)
    # skipped_old_ids = 0
    # skipped_author_papers = 0
    # for a in self.author_papers.keys():
    #     if a in self.new_author_papers:
    #         if self.new_author_papers[a] != self.author_papers[a]:
    #             papers_to_add = [x for x in self.author_papers[a] if x not in self.new_author_papers[a]]
    #             self.new_author_papers[a].extend(papers_to_add)
    #             self.logger.debug("{} is in new author papers, but need to add {} papers".format(a, len(papers_to_add)))
    #         else:
    #             skipped_author_papers += 1
    #             # self.logger.debug("Skipping author {}, in new_author_papers".format(a))
    #     elif a in self.old_ids:
    #         if self.remove_all_papers:
    #             skipped_old_ids += 1
    #             # self.logger.debug("Skipping author {}, in old_ids".format(a))
    #     else:
    #         if a not in self.id_to_name:
    #             self.logger.warning("{} is in author_papers but not in id_to_name".format(a))
    #         else:
    #             self.new_author_papers[a] = deepcopy(self.author_papers[a])
    #             self.new_id_to_name[a] = self.id_to_name[a]
    #     auth_pbar.update()
    # auth_pbar.close()
    # self.logger.debug("Skipped {} authors due to being in new_author_papers".format(skipped_author_papers))
    # self.logger.debug("Skipped {} authors due to being in old_ids".format(skipped_old_ids))
    printLogToConsole(self.console_log_level, "Adding papers", logging.INFO, logger=self.logger)
    paper_pbar = tqdm(total=len(self.papers), file=sys.stdout)
    for pid, paper in self.papers.items():
        if pid in self.error_papers:
            self.logger.debug("{} is in error_papers, but not in self.new_papers".format(pid))
        else:
            if pid in self.new_papers:
                paper = self.new_papers[pid]
            else:
                self.new_papers[pid] = paper
            for a in paper.affiliations.keys():
                if a not in self.new_id_to_name:
                    self.new_id_to_name[a] = self.id_to_name[a]
                if pid not in self.new_author_papers[a]:
                    self.new_author_papers[a].append(pid)
        paper_pbar.update()
    paper_pbar.close()
    return self.new_papers, self.new_author_papers, self.new_id_to_name
def _makePredictions(self, author_arrays):
    printLogToConsole(self.console_log_level, "Predicting same authors", logging.INFO)
    self.logger.info("Predicting same authors")
    predictions = defaultdict(dict)
    probabilities = defaultdict(dict)
    pbar = tqdm(total=len(author_arrays), file=sys.stdout)
    for target, info in author_arrays.items():
        pbar.write("INFO: Predicting same authors to {}".format(target))
        self.logger.info("Predicting same authors to {}".format(target))
        for author, results in info.items():
            self.logger.debug("Making predictions for {}".format(author))
            predictions[target][author] = self.model.predict(results).tolist()
            try:
                probabilities[target][author] = self.model.predict_proba(results).tolist()
            except AttributeError:
                # hard-voting classifiers do not expose predict_proba
                self.logger.warning("Could not get probabilities for {} - {}".format(target, author))
        pbar.update()
    pbar.close()
    return predictions, probabilities
def _genAuthorOverride(self, a):
    self.logger.debug("Received generate override command for {}".format(a))
    if a in self.override_authors:
        printLogToConsole(self.console_log_level,
                          "{} already has authors to compare with".format(a), logging.INFO,
                          logger=self.logger)
        return
    elif a not in self.targets:
        printLogToConsole(self.console_log_level, "{} is not a target".format(a), logging.INFO,
                          logger=self.logger)
        return
    # strip parenthetical notes from the raw name, then collapse double spaces
    name = cleanName(
        remove_weird_notes.sub(" ", nameFromDict(self.id_to_name[a])).replace("  ", " ")).replace("  ", " ")
    print("INFO: Other authors with the same name:")
    for other_a in self.names[name]:
        if other_a != a:
            print("INFO: {}".format(other_a))
    if len(self.names[name]) == 1:
        printLogToConsole(self.console_log_level,
                          "{} only has {}, will not add authors to compare with".format(name, a),
                          logging.INFO, logger=self.logger)
    else:
        self.override_authors[a] = [x for x in self.names[name] if x != a]
        self.logger.debug("{} authors added to override_authors".format(len(self.override_authors[a])))
def __call__(self, target_authors, override_authors=None, evaluation_mode=False):
    if not override_authors:
        override_authors = {}
    override_authors_len = len(override_authors)
    self.logger.debug("__call__ called with arguments: ")
    self.logger.debug("\tlen(target_authors)={}".format(len(target_authors)))
    self.logger.debug("\tlen(override_authors)={}".format(len(override_authors)))
    printLogToConsole(self.console_log_level, "Starting Disambiguation", logging.INFO)
    self.logger.info("Starting Disambiguation")
    has_authors, needs_authors = self._errorCheckCallArgs(target_authors, override_authors)
    ambiguous_authors_res = self._makeAmbiguousAuthors(has_authors, needs_authors, override_authors)
    ambiguous_papers, ambiguous_names, check_authors, authors_to_get, excluded_authors = ambiguous_authors_res
    self.logger.debug("{} authors had no similar authors".format(len(excluded_authors)))
    ambiguous_papers_to_use = {x: ambiguous_papers[x] for x in ambiguous_papers if x not in excluded_authors}
    to_compare, excluded = self._makeAmbiguousPairs(ambiguous_papers_to_use, check_authors, authors_to_get)
    # initialize it here so that even if not using self.same_paper_diff_people, it can run without any errors
    known_different = {}
    if self.same_paper_diff_people:
        self.logger.debug("Removing excluded")
        to_compare, known_different = self._removeKnownDifferent(to_compare, excluded)
    compare_results = self._compareAmbiguousPairs(to_compare)
    compare_results = self._consolidateResults(compare_results)
    predictions, probabilities = self._makePredictions(compare_results)
    if self.use_probabilities:
        to_use = {}
        for k, info in probabilities.items():
            # keep only the probability of the "same" class for each pair
            to_use[k] = {x: [y[1] for y in info[x]] for x in info.keys()}
    else:
        to_use = predictions
    warning_auth = []
    correct_dict = defaultdict(dict)
    printLogToConsole(self.console_log_level, "Determining the correct author", logging.INFO)
    self.logger.info("Determining the correct author")
    pbar = tqdm(total=len(predictions), file=sys.stdout)
    for k, pred in to_use.items():
        self.logger.debug("Determining the correct author for {}".format(k))
        correct, above_thres = self._determineCorrectAuthor(pred, evaluation_mode)
        correct_dict[k]["same"] = correct
        correct_dict[k]["different"] = [x for x in pred.keys() if x != correct]
        self.logger.debug("{} was determined to be the same as {}".format(k, correct))
        if evaluation_mode:
            correct_dict[k]["percent_same"] = above_thres
        if len(above_thres) != 1 and not evaluation_mode:
            self.logger.debug("Added {} to warnings".format(k))
            warning_auth.append([k, above_thres])
        correct_dict[k]["papers_affected"] = ambiguous_papers[k]
        pbar.update()
    pbar.close()
    printLogToConsole(self.console_log_level, "Writing results to results.json", logging.INFO)
    self.logger.info("Writing results to results.json")
    with open("results.json", "w") as f:
        json.dump(correct_dict, f, indent=4, sort_keys=True)
    return correct_dict
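# Shape of the results.json written above, reconstructed from the code; the ids
# shown are made up. Each key is an ambiguous author id, "same" is the id chosen
# by _determineCorrectAuthor, and "percent_same" only appears in evaluation mode.
#
# {
#     "jane-doe1": {
#         "same": "jane-doe",
#         "different": ["jane-m-doe"],
#         "papers_affected": ["P19-1001"]
#     }
# }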
def _consolidateResults(self, compare_results):
    printLogToConsole(self.console_log_level, "Consolidating Compare results", logging.INFO)
    self.logger.info("Consolidating Compare results")
    out = {}
    pbar = tqdm(total=len(compare_results), file=sys.stdout)
    for k, results in compare_results.items():
        pid, k_id = k.split(" ")
        if k_id not in out:
            out[k_id] = defaultdict(list)
        for _id, id_results in results.items():
            out[k_id][_id].extend(id_results)
        pbar.update()
    pbar.close()
    self.logger.debug("Converting to np arrays")
    author_compare_results = {}
    for author, info in out.items():
        author_results = {}
        self.logger.debug("Converting {} results".format(author))
        for other_id, compare_arrays in info.items():
            self.logger.debug("Consolidating results from {}".format(other_id))
            if any(1 for x in compare_arrays if len(x) != self.compare_terms):
                self.logger.error(
                    "A compare result from {}-{} does not have the correct number of terms".format(
                        author, other_id))
                self.logger.error("Lengths are: {}".format([len(x) for x in compare_arrays]))
                self.logger.error("Expected length is: {}".format(self.compare_terms))
                raise ValueError("Compare results length does not match comparator's result length")
            try:
                author_results[other_id] = np.array(compare_arrays)
            except Exception as e:
                self.logger.warning(
                    "Ran into exception {} when converting compare_results, trying array by array".format(e))
                tmp_arrays = []
                for a in compare_arrays:
                    tmp_arrays.append(np.asarray(a))
                author_results[other_id] = np.asarray(tmp_arrays).reshape(len(compare_arrays),
                                                                          self.compare_terms)
        author_compare_results[author] = author_results
    return author_compare_results
def _prepareData(self, separated, paper_auth_info, algorithm):
    special_cases_dict = self._getSpecialCases(separated)
    same = []
    different = []
    special_same = []
    special_diff = []
    sorted_keys = sorted(list(separated.keys()))
    for k in sorted_keys:
        info = separated[k]
        printLogToConsole(self.console_log_level,
                          "Creating pairs for authors starting with {}".format(k), logging.INFO)
        self.logger.log(logging.INFO, "Creating pairs for authors starting with {}".format(k))
        special_cases = None
        if k in special_cases_dict:
            special_cases = special_cases_dict[k]
        auth_info = [(x, paper_auth_info[x]) for x in info]
        tmp_same, tmp_diff = self._makeCombinations(auth_info, special_cases)
        gc.collect()
        self.logger.debug("{} pairs to add to same".format(len(tmp_same)))
        self.logger.debug("{} pairs to add to different".format(len(tmp_diff)))
        same.extend([[1, p] for p in tmp_same])
        different.extend([[0, p] for p in tmp_diff])
    self.logger.debug("{} same pairs".format(len(same)))
    self.logger.debug("{} different pairs".format(len(different)))
    printLogToConsole(self.console_log_level, "Handling special cases", logging.INFO)
    self.logger.log(logging.INFO, "Handling special cases")
    for k, info in special_cases_dict.items():
        printLogToConsole(self.console_log_level,
                          "Creating pairs for special cases that start with {}".format(k), logging.INFO)
        self.logger.log(logging.INFO, "Creating pairs for special cases that start with {}".format(k))
        auth_info = [(x, paper_auth_info[x]) for x in info]
        # special-case pairs are built without the name-similarity cutoff
        tmp_same, tmp_diff = self._makeCombinations(auth_info, use_cutoff=False)
        special_same.extend([[1, p] for p in tmp_same])
        special_diff.extend([[0, p] for p in tmp_diff])
    return same, different, special_same, special_diff
def _selectPairsToUse(self, same, diff):
    printLogToConsole(self.console_log_level, "Selecting pairs to use", logging.INFO)
    self.logger.log(logging.INFO, "Selecting pairs to use")
    self.logger.debug("len(same) -> {}".format(len(same)))
    self.logger.debug("len(different) -> {}".format(len(diff)))
    len_same = len(same)
    len_diff = len(diff)
    # cap the pair count at the smaller class times the configured ratio
    if len_same > len_diff:
        pair_count = int(len_diff * self.dif_same_ratio)
    else:
        pair_count = int(len_same * self.dif_same_ratio)
    self.logger.debug("pair_count -> {}".format(pair_count))
    if self.pair_distribution == "similarity":
        printLogToConsole(self.console_log_level, "Using similarity distribution", logging.INFO)
        self.logger.log(logging.INFO, "Using similarity distribution")
        printLogToConsole(self.console_log_level, "Similarity distribution is not implemented yet",
                          logging.CRITICAL)
        self.logger.log(logging.ERROR, "Similarity distribution is not implemented yet")
        # TODO: Implement similarity distribution
        raise ValueError("Similarity distribution is not implemented yet")
    elif self.pair_distribution == "random":
        printLogToConsole(self.console_log_level, "Using random selection", logging.INFO)
        self.logger.log(logging.INFO, "Using random selection")
        try:
            out_same = random.sample(same, pair_count)
        except ValueError:
            # sample size exceeded the population, fall back to truncation
            out_same = same[:pair_count]
        try:
            out_diff = random.sample(diff, pair_count)
        except ValueError:
            out_diff = diff[:pair_count]
        return out_same, out_diff
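# A small worked example of the selection above, with made-up counts: with
# len(same) = 1000, len(diff) = 4000, and dif_same_ratio = 1.5, same is the
# smaller class, so pair_count = int(1000 * 1.5) = 1500. Random selection draws
# 1500 different pairs, while sampling same raises ValueError (1500 > 1000) and
# falls back to truncation, keeping all 1000 same pairs.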
def _createTrainTest(self, data):
    printLogToConsole(self.console_log_level, "Creating train and test", logging.INFO)
    self.logger.info("Creating train and test")
    same, different, special_same, special_different = self._parseData(data)

    def saveData(d, file_path):
        with open(file_path, "wb") as f:
            to_save = [x for x in d]
            pickle.dump(to_save, f)

    if self.save_data:
        saveData(same, self.save_path + "/same.pickle")
        saveData(different, self.save_path + "/different.pickle")
        saveData(special_same, self.save_path + "/special_same.pickle")
        saveData(special_different, self.save_path + "/special_different.pickle")
        all_pairs = []
        for k, t, _ in [*same, *different, *special_same, *special_different]:
            all_pairs.append([k, t])
        with open(self.save_path + "/save_pairs.pickle", "wb") as f:
            pickle.dump(all_pairs, f)
    same = self.convertToUsable(same)
    different = self.convertToUsable(different)
    special_same = self.convertToUsable(special_same)
    special_different = self.convertToUsable(special_different)
    same, different = self._selectPairsToUse(same, different)
    special_same, special_different = self._selectPairsToUse(special_same, special_different)
    printLogToConsole(self.console_log_level, "Splitting non-special pairs", logging.INFO)
    self.logger.info("Splitting non-special pairs")
    train, test = self._splitTrainTest(same, different, special_same, special_different)
    printLogToConsole(self.console_log_level, "Splitting special pairs", logging.INFO)
    self.logger.info("Splitting special pairs")
    special_train, special_test = self._splitTrainTest(special_same, special_different)
    return train, test, special_train, special_test
def __call__(self, pairs_to_use=None, authors_to_use=None, debug_retrieve_info=None, get_info_all=False,
             debug_asserts=False):
    gc.collect()
    total_run_start = time.time()
    if pairs_to_use and authors_to_use:
        self.logger.warning(
            "Both pairs_to_use and authors_to_use were passed, pairs_to_use will override authors_to_use")
    if authors_to_use is None:
        authors_to_use = []
    override_pairs_to_use = False
    if pairs_to_use is None:
        pairs_to_use = []
        override_pairs_to_use = True
    tasks, out, ignored, excluded = self._populateConstants()
    results = []
    paper_auth_info = {}

    """
    Initialize data
    """
    printLogToConsole(self.console_log_level, "Getting author info", logging.INFO)
    self.logger.log(logging.INFO, "Getting author info")
    below_cutoff = 0
    with tqdm(total=tasks, file=sys.stdout) as pbar:
        for i in out:
            add_author = False
            if authors_to_use and i[1] in authors_to_use:
                add_author = True
            elif len(self.valid_author_papers[i[1]]) >= self.author_cutoff or i[1] in self.special_keys:
                add_author = True
            elif len(self.valid_author_papers[i[1]]) < self.author_cutoff:
                below_cutoff += 1
                if i[1] in self.special_keys:
                    add_author = True
            if add_author:
                pair_key, res = getAuthorInfo(i)
                # results.append((pair_key, res))
                paper_auth_info[pair_key] = res
            pbar.update()
    self.logger.debug("{} Authors below cutoff".format(below_cutoff))

    """
    Separate the authors by the first char(s) that appear in their ids, to reduce the number of pointless pairs
    """
    printLogToConsole(self.console_log_level, "Separating keys by chars in name", logging.INFO)
    self.logger.log(logging.INFO, "Separating keys by chars in name")
    separated_keys = self._createPairDict(list(paper_auth_info.keys()), self.separate_chars,
                                          self.separate_words)

    """
    Create the pairs needed and put the special cases into their own arrays, then select the pairs to use
    based on the ratio defined in initialization to ensure a controlled number of same to different. This
    DOES NOT occur to any special cases. If pairs_to_use is defined, skip and use predefined pairs
    """
    if not pairs_to_use:
        printLogToConsole(self.console_log_level, "Creating pairs", logging.INFO)
        self.logger.log(logging.INFO, "Creating pairs")
        same, diff, special_same, special_diff = self._prepareData(separated_keys, paper_auth_info,
                                                                   self.algorithm)
        self.logger.debug("len(same) = {}".format(len(same)))
        self.logger.debug("len(different) = {}".format(len(diff)))
        self.logger.debug("len(special_same) = {}".format(len(special_same)))
        self.logger.debug("len(special_different) = {}".format(len(special_diff)))
        if not get_info_all:
            self.logger.debug("Splitting pairs")
            same, diff = self._selectPairsToUse(same, diff)
        else:
            self.logger.debug("Getting all info")
        pairs_to_use = [*same, *diff, *special_same, *special_diff]
    else:
        printLogToConsole(self.console_log_level, "Using passed pairs", logging.INFO)
        self.logger.log(logging.INFO, "Using passed pairs")
        same = []
        diff = []
        special_same = []
        special_diff = []
        for t, pair_data in pairs_to_use:
            is_special = False
            for special_case in self.special_keys:
                if special_case in pair_data[1] or special_case in pair_data[2]:
                    is_special = True
            if t == 1:
                if is_special:
                    special_same.append([t, pair_data])
                else:
                    same.append([t, pair_data])
            elif t == 0:
                if is_special:
                    special_diff.append([t, pair_data])
                else:
                    diff.append([t, pair_data])

    """
    Take the pairs and get the info needed for them. This is done here in order to save runtime memory
    """
    to_use = []
    printLogToConsole(self.console_log_level, "Retrieving info for pairs", logging.INFO)
    self.logger.log(logging.INFO, "Retrieving info for pairs")
    for p in pairs_to_use:
        try:
            tag, pair_info = p
        except ValueError as e:
            self.logger.error("Error raised when retrieving info for pairs")
            self.logger.error(
                "Issue with value unpacking for p when iterating over pairs_to_use. expected 2 got {}".format(
                    len(p)))
            self.logger.error("p: {}".format(p))
            self.logger.exception(e)
            raise e  # just to make warnings stop
        try:
            key, a, b = pair_info
        except ValueError as e:
            self.logger.error("Error raised when unpacking pair info")
            self.logger.error(
                "Issue with value unpacking for pair_info. expected 3 got {}".format(len(pair_info)))
            self.logger.error("pair_info: {}".format(pair_info))
            self.logger.exception(e)
            raise e
        to_use.append([key, tag, paper_auth_info[a], paper_auth_info[b]])
    if debug_retrieve_info:
        return to_use
    random.shuffle(to_use)

    """
    Compare those pairs, print out result stats, and pickle the data
    """
    comparator = CompareAuthors(**self.compare_args)
    printLogToConsole(self.console_log_level, "Comparing authors", logging.INFO)
    self.logger.log(logging.INFO, "Comparing authors")
    if self.cores == 1 or len(to_use) < 20000:
        pbar = tqdm(total=len(to_use), file=sys.stdout)
        for i in to_use:
            results.append(comparator(i))
            pbar.update()
        pbar.close()
    else:
        printLogToConsole(self.console_log_level,
                          "Comparing {} pairs in parallel".format(len(to_use)), logging.INFO)
        self.logger.info("Comparing {} pairs in parallel".format(len(to_use)))
        batches = chunks(to_use, self.compare_batch_size)
        batch_count = len(to_use) // self.compare_batch_size
        if len(to_use) % self.compare_batch_size != 0:
            batch_count += 1
        self.logger.debug("{} batches".format(batch_count))
        with mp.Pool(self.cores) as pool:
            imap_results = list(
                tqdm(pool.imap_unordered(comparator.processBatch, batches), total=batch_count,
                     file=sys.stdout))
        self.logger.debug("Combining results from pool")
        for res in imap_results:
            results.extend(res)
    total_run_end = time.time()
    hours, rem = divmod(total_run_end - total_run_start, 3600)
    minutes, seconds = divmod(rem, 60)
    stats = [
        ["Total Pairs Used", len(to_use)],
        ["Same", len(same)],
        ["Different", len(diff)],
        ["Special Same", len(special_same)],
        ["Special Different", len(special_diff)],
    ]
    printStats("Results", stats, line_adaptive=True)
    printLogToConsole(self.console_log_level,
                      "Total Run Time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds),
                      logging.INFO)
    self.logger.info("Total Run Time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    if self.save_data:
        printLogToConsole(self.console_log_level, "Writing author_papers.json", logging.INFO,
                          logger=self.logger)
        with open(self.json_path + "/author_papers.json", "w") as f:
            json.dump(self.all_author_papers, f, indent=4, sort_keys=True)
        printLogToConsole(self.console_log_level, "Pickling results", logging.INFO, logger=self.logger)
        with open(self.pickle_path + "/tagged_pairs.pickle", "wb") as f:
            pickle.dump(results, f)
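# chunks() and printStats() come from elsewhere in the repo. A minimal sketch of
# what chunks is assumed to do (slice a list into fixed-size batches for the
# worker pool), for reference; the real helper may differ:
def chunks(seq, size):
    """Yield successive size-sized slices of seq; the last slice may be shorter."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]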
def _removeTarget(self):
    self.logger.debug("Received remove command")
    print("INFO: Select the number of the id you would like to remove from targets, enter e to exit")
    if len(self.targets) == 0:
        printLogToConsole(self.console_log_level, "No possible targets to remove", logging.INFO,
                          logger=self.logger)
        return
    while 1:
        for i, v in enumerate(self.targets):
            print("INFO: [{}] {}".format(i, v))
        to_remove = input(">>")
        if to_remove == "e":
            self.logger.debug("Exit command received")
            return
        try:
            to_remove = int(to_remove)
        except ValueError:
            printLogToConsole(self.console_log_level, "{} is not valid".format(to_remove), logging.INFO,
                              logger=self.logger)
            continue
        if to_remove < 0 or to_remove >= len(self.targets):
            printLogToConsole(self.console_log_level, "{} is not valid".format(to_remove), logging.INFO,
                              logger=self.logger)
            continue
        printLogToConsole(self.console_log_level,
                          "{} is selected to be removed".format(self.targets[to_remove]), logging.INFO,
                          logger=self.logger)
        if self.confirmAction():
            self.targets.remove(self.targets[to_remove])
            printLogToConsole(self.console_log_level, "Remaining targets:", logging.INFO,
                              logger=self.logger)
            for i, v in enumerate(self.targets):
                print("INFO: {}".format(v))
            return
        else:
            self.logger.debug("User did not confirm action")
def _addTarget(self, a):
    self.logger.debug("Received add command with arguments {}".format(a))
    valid_author = self._validAuthor(a)
    if valid_author == -1:
        printLogToConsole(self.console_log_level, "{} is not a valid author id".format(a), logging.INFO,
                          logger=self.logger)
    elif valid_author == -2:
        printLogToConsole(self.console_log_level, "{} has no parsed papers".format(a), logging.INFO,
                          logger=self.logger)
    else:
        self._getAuthorInfo(a)
        self.targets.append(a)
        if a not in self.override_authors:
            printLogToConsole(self.console_log_level,
                              "{} does not have specified authors to compare with".format(a), logging.INFO,
                              logger=self.logger)
        else:
            printLogToConsole(self.console_log_level,
                              "{} has {} authors to compare with".format(a, len(self.override_authors[a])),
                              logging.INFO, logger=self.logger)
        printLogToConsole(self.console_log_level, "{} added to targets".format(a), logging.INFO,
                          logger=self.logger)
        printLogToConsole(self.console_log_level, "{} current targets".format(len(self.targets)),
                          logging.INFO, logger=self.logger)
    return
def __init__(self, papers=None, author_papers=None, compare_args=None, id_to_name=None,
             console_log_level=logging.ERROR, file_log_level=logging.DEBUG, log_format=None, log_path=None,
             save_data=False, ext_directory=False, save_path=None, threshold=.2,
             name_similarity_cutoff=.92, str_algorithm="jaro-similarity", model=None, model_name="VC1",
             model_path=None, create_new_author=False, compare_cutoff=3, tie_breaker="max", cores=4,
             DEBUG_MODE=False, sim_overrides=False, allow_authors_not_in_override=True,
             same_paper_diff_people=True, use_probabilities=False):
    if not log_format:
        log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
    if not log_path:
        log_path = os.getcwd() + "/logs/disambiguation.log"
    self.logger = createLogger("author_disambiguation", log_path, log_format, console_log_level,
                               file_log_level)
    self.console_log_level = console_log_level
    self.model = model
    self.model_name = model_name
    if self.model is None:
        if not model_path:
            model_path = os.getcwd()
        with open("{}/models/{}/model.pickle".format(model_path, model_name), "rb") as f:
            self.model = pickle.load(f)
    try:
        if self.model.voting == "hard" and use_probabilities:
            self.logger.warning("hard voting does not support probabilities")
            self.use_probabilities = False
        else:
            self.use_probabilities = use_probabilities
    except AttributeError:
        self.logger.debug("model does not have voting")
        self.use_probabilities = False
    if not DEBUG_MODE:
        # Argument validation
        if compare_args and not isinstance(compare_args, dict):
            self.logger.error("passed compare_args is not valid")
            self.logger.exception(TypeError("compare_args is not a dict"))
            raise TypeError("compare_args is not a dict")
        elif not compare_args:
            self.logger.error("passed compare_args is not valid")
            self.logger.exception(ValueError("compare_args is None"))
            raise ValueError("compare_args is None")
        else:
            self.compare_args = compare_args
        if author_papers and (not isinstance(author_papers, dict)
                              and not isinstance(author_papers, defaultdict)):
            self.logger.error("passed author_papers is not valid")
            self.logger.error("type is {}".format(type(author_papers)))
            self.logger.exception(TypeError("author_papers is not a dict"))
            raise TypeError("author_papers is not a dict")
        elif not author_papers:
            author_papers, status, error_msg = self._findData("author_papers.json")
            if status != 0:
                self.logger.error(
                    "passed author_papers is not valid and could not find the file author_papers.json")
                self.logger.error(
                    "self._findData(\"author_papers.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid author_papers found"))
                raise ValueError("No valid author_papers found")
            else:
                self.author_papers = deepcopy(author_papers)
        else:
            self.author_papers = deepcopy(author_papers)
        if papers and not isinstance(papers, dict):
            self.logger.error("passed papers is not valid")
            self.logger.exception(TypeError("papers is not a dict"))
            raise TypeError("papers is not a dict")
        elif not papers:
            papers, status, error_msg = self._findData("parsed_papers.json")
            if status != 0:
                self.logger.error(
                    "passed papers is not valid and could not find the file parsed_papers.json")
                self.logger.error(
                    "self._findData(\"parsed_papers.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid parsed_papers found"))
                raise ValueError("No valid parsed_papers found")
            else:
                if len(papers) == 0:
                    self.logger.exception(ValueError("Found papers is empty"))
                    raise ValueError("Found papers is empty")
                self.logger.debug("Converting papers from dict to Paper object")
                self.papers = {}
                for k, info in papers.items():
                    self.papers[k] = Paper(**info)
        else:
            if len(papers) == 0:
                self.logger.exception(ValueError("Passed papers is empty"))
                raise ValueError("Passed papers is empty")
            # check a sample value: passed papers may be raw dicts or already Paper objects
            test_value = papers[list(papers.keys())[0]]
            if isinstance(test_value, dict):
                self.papers = {}
                for k, info in papers.items():
                    try:
                        self.papers[k] = Paper(**info)
                    except Exception as e:
                        self.logger.error("Exception raised when converting paper dicts to Paper")
                        self.logger.error("k={}".format(k))
                        self.logger.error("info={}".format(info))
                        self.logger.exception(e)
                        raise e
            else:
                self.papers = papers
        if id_to_name and not isinstance(id_to_name, dict):
            self.logger.error("passed id_to_name is not valid")
            self.logger.exception(TypeError("id_to_name is not a dict"))
            raise TypeError("id_to_name is not a dict")
        elif not id_to_name:
            id_to_name, status, error_msg = self._findData("id_to_name.json")
            if status != 0:
                self.logger.error(
                    "passed id_to_name is not valid and could not find the file id_to_name.json")
                self.logger.error(
                    "self._findData(\"id_to_name.json\") returned error {}".format(error_msg))
                self.logger.exception(ValueError("No valid id_to_name found"))
                raise ValueError("No valid id_to_name found")
            else:
                if len(id_to_name) == 0:
                    self.logger.exception(ValueError("Found id_to_name is empty"))
                    raise ValueError("Found id_to_name is empty")
                self.id_to_name = id_to_name
        else:
            if len(id_to_name) == 0:
                self.logger.exception(ValueError("Passed id_to_name is empty"))
                raise ValueError("Passed id_to_name is empty")
            self.id_to_name = id_to_name
    else:
        printLogToConsole(self.console_log_level, "RUNNING IN DEBUG_MODE!", logging.WARNING)
        self.logger.warning("Running in DEBUG_MODE")
        self.id_to_name = id_to_name if id_to_name else {}
        self.papers = papers if papers else {}
        self.compare_args = compare_args if compare_args else {}
        self.author_papers = author_papers if author_papers else {}
    self.compare_terms = len(CompareAuthors.compare_terms)
    self.save_data = save_data
    self.save_dir = save_path
    self.ext_directory = ext_directory
    self.threshold = threshold
    self.name_similarity_cutoff = name_similarity_cutoff
    algo_name, measure = str_algorithm.split("-")
    self.author_name = {x: nameFromDict(self.id_to_name[x]) for x in self.id_to_name.keys()}
    self.cores = cores
    self.str_algorithm = getAlgo(algo_name, measure)
    self.create_new_author = create_new_author
    self.compare_cutoff = compare_cutoff
    self.tie_breaker = tie_breaker
    self.sim_overrides = sim_overrides
    self.allow_authors_not_in_override = allow_authors_not_in_override
    self.same_paper_diff_people = same_paper_diff_people
    self.logger.debug("AuthorDisambiguation initialized with arguments:")
    self.logger.debug("\tcompare_args={}".format(list(self.compare_args.keys())))
    self.logger.debug("\talgorithm={}".format(algo_name))
    self.logger.debug("\tmeasure={}".format(measure))
    self.logger.debug("\tthreshold={}".format(threshold))
    self.logger.debug("\tname_similarity_cutoff={}".format(name_similarity_cutoff))
    self.logger.debug("\tunique authors={}".format(len(self.author_papers)))
    self.logger.debug("\tcompare_cutoff={}".format(self.compare_cutoff))
    self.logger.debug("\ttie_breaker={}".format(self.tie_breaker))
    self.logger.debug("\tsim_overrides={}".format(self.sim_overrides))
    self.logger.debug("\tsame_paper_diff_people={}".format(self.same_paper_diff_people))
    self.logger.debug("\tuse_probabilities={}".format(self.use_probabilities))
    if self.compare_cutoff != 3:
        self.logger.warning("Non-default value for compare_cutoff, currently this is not implemented")
def evaluate(self):
    printLogToConsole(self.console_log_level, "Evaluating model", logging.INFO)
    self.logger.info("Evaluating model")
    if self.train_all_estimators:
        predictions = {}
        special_predictions = {}
        for n, m in self.estimators:
            self.logger.debug("Making predictions for {}".format(n))
            predictions[n] = m.predict(self.test["X"])
            special_predictions[n] = m.predict(self.special_test["X"])
        printLogToConsole(self.console_log_level, "Results for all estimators", logging.INFO)
        self.logger.info("Results for all estimators")
        if not self.special_only:
            printLogToConsole(self.console_log_level,
                              "First stat line is on normal test, second is for special cases",
                              logging.INFO)
        column_str = "{} {:>11} {:>11} {:>11}".format(" " * 25, "precision", "recall", "f1-score")
        printLogToConsole(self.console_log_level, column_str, logging.INFO)
        self.logger.info(column_str)
        for k, pred in predictions.items():
            precision, recall, _, _ = precision_recall_fscore_support(self.test["Y"], pred,
                                                                      average="binary")
            f1 = f1_score(self.test["Y"], pred, average="binary")
            stat_str = "{:<25} {:>11.2f} {:>11.2f} {:>11.2f}".format(k + ":", precision, recall, f1)
            printLogToConsole(self.console_log_level, stat_str, logging.INFO)
            self.logger.info(stat_str)
            if self.special_only:
                continue
            precision, recall, _, _ = precision_recall_fscore_support(self.special_test["Y"],
                                                                      special_predictions[k],
                                                                      average="binary")
            f1 = f1_score(self.special_test["Y"], special_predictions[k], average="binary")
            stat_str = "{:<25} {:>11.2f} {:>11.2f} {:>11.2f}".format(k + ":", precision, recall, f1)
            printLogToConsole(self.console_log_level, stat_str, logging.INFO)
            self.logger.info(stat_str)
    model_predictions = self.model.predict(self.test["X"])
    printLogToConsole(self.console_log_level, "Model stats on test data:", logging.INFO)
    self.logger.info("Model stats on test data")
    stats = classification_report(self.test["Y"], model_predictions, target_names=["Different", "Same"])
    print(stats)
    self.logger.info(stats)
    if not self.special_only:
        model_predictions = self.model.predict(self.special_test["X"])
        printLogToConsole(self.console_log_level, "Model stats on special cases data:", logging.INFO)
        self.logger.info("Model stats on special cases data")
        stats = classification_report(self.special_test["Y"], model_predictions,
                                      target_names=["Different", "Same"])
        print(stats)
        self.logger.info(stats)
def _makeAmbiguousAuthors(self, has_authors, needs_authors, override_authors):
    ambiguous_author_papers = defaultdict(list)
    ambiguous_author_names = dict()
    authors_get_info = list()
    check_author_keys = defaultdict(list)
    excluded = []
    for i in [*has_authors, *needs_authors]:
        ambiguous_author_papers[i] = self.author_papers.pop(i)
        try:
            ambiguous_author_names[i] = cleanName(nameFromDict(self.id_to_name[i])).lower()
            del self.author_name[i]
        except KeyError:
            self.logger.warning("{} is not in id_to_name".format(i))
            excluded.append(i)
    for a in has_authors:
        if a in excluded:
            self.logger.debug("Skipping {} because it is in excluded".format(a))
            continue
        authors_get_info.extend(override_authors[a])
        check_author_keys[a] = self._makeCheckAuthors(override_authors[a])
    args = []
    for a in needs_authors:
        if a in excluded:
            self.logger.debug("Skipping {} because it is in excluded".format(a))
            continue
        args.append([a, ambiguous_author_names[a], self.author_name, self.str_algorithm,
                     self.name_similarity_cutoff, self.sim_overrides])
    printLogToConsole(self.console_log_level,
                      "Getting similar authors in parallel with {} cores".format(self.cores), logging.INFO)
    self.logger.info("Getting similar authors in parallel with {} cores".format(self.cores))
    sim_authors = []
    with mp.Pool(self.cores) as pool:
        imap_results = list(
            tqdm(pool.imap_unordered(self._getSimilarAuthors, args), total=len(args), file=sys.stdout))
    for target, auth, warnings, debug in imap_results:
        self.logger.debug("Adding authors from {}".format(target))
        self.logger.debug("len(auth)={}".format(len(auth)))
        sim_authors.append([target, auth])
        for i in warnings:
            self.logger.warning(i)
        for i in debug:
            self.logger.debug(i)
    self.logger.debug("len(sim_authors)={}".format(len(sim_authors)))
    pbar = tqdm(total=len(sim_authors), file=sys.stdout)
    for a, auths in sim_authors:
        if a in override_authors:
            self.logger.exception(
                ValueError("{} is in need authors, but is already in override_authors".format(a)))
            raise ValueError("{} is in need authors, but is already in override_authors".format(a))
        pbar.write("INFO: Checking similar authors to {}".format(a))
        self.logger.info("Checking similar authors to {}".format(a))
        if len(auths) == 0:
            self.logger.warning("{} has no similar authors".format(a))
            excluded.append(a)
        else:
            authors_get_info.extend(auths)
            check_author_keys[a] = self._makeCheckAuthors(auths)
            if len(check_author_keys[a]) == 0:
                self.logger.debug(
                    "{} had at least 1 similar author, but nothing in check author keys".format(a))
        pbar.update()
    pbar.close()
    authors_get_info = list(set(authors_get_info))
    return ambiguous_author_papers, ambiguous_author_names, check_author_keys, authors_get_info, excluded
def _makeCombinations(self, author_infos, special_cases=None, use_cutoff=True):
    if not special_cases:
        special_cases = []
    self.logger.debug("len(special_cases)={}".format(len(special_cases)))
    infos = {x[0]: x[1] for x in author_infos}
    keys = list(infos.keys())
    same = {}
    different = {}
    name_cutoff = self.name_similarity_cutoff if use_cutoff else 0
    combinations = []
    pair_creator_pbar = tqdm(total=int(ncr(len(keys), 2)), file=sys.stdout)
    max_size_allocate = 0
    estimated_size_to_allocate = 0
    special_cases_combos = []
    for i, a in enumerate(keys):
        for b in keys[1 + i:]:
            a_paper, a_id = a.split(" ")
            b_paper, b_id = b.split(" ")
            if a_id in special_cases and b_id in special_cases:
                special_cases_combos.append([a, b, self.algorithm, special_cases, name_cutoff])
                continue
            combo_to_add = [a, b, self.algorithm, special_cases, name_cutoff]
            combo_size = sys.getsizeof(combo_to_add)
            combinations.append(combo_to_add)
            max_size_allocate = max(max_size_allocate, combo_size)
            estimated_size_to_allocate += combo_size
            pair_creator_pbar.update()
    pair_creator_pbar.close()
    self.logger.log(logging.DEBUG, "{} combinations".format(len(combinations)))
    self.logger.debug("{} special combinations".format(len(special_cases_combos)))
    self.logger.log(logging.DEBUG, "Max combination size {} bytes".format(max_size_allocate))
    self.logger.log(logging.DEBUG,
                    "Size of combinations {}".format(size(estimated_size_to_allocate, si)))
    printLogToConsole(self.console_log_level, "Removing pairs that are not valid", logging.INFO)
    self.logger.log(logging.INFO, "Removing pairs that are not valid")
    total_combinations = len(combinations)
    if self.cores == 1 or total_combinations < self.min_batch_len:
        if total_combinations < self.min_batch_len:
            self.logger.debug("total combinations is less than min batch length ({} < {})".format(
                total_combinations, self.min_batch_len))
        pbar = tqdm(total=total_combinations, file=sys.stdout)
        for combo in combinations:
            res = checkPair(combo)
            if res:
                tag, res = res
                if tag == 1:
                    same[res[0]] = res
                else:
                    different[res[0]] = res
            pbar.update()
        pbar.close()
    else:
        # Put the pairs into batches so they can be used in parallel, otherwise the overhead is too much
        batches = chunks(combinations, self.batch_size)
        batch_count = total_combinations // self.batch_size
        tmp_same = []
        tmp_different = []
        possible_errors = 0
        if total_combinations % self.batch_size != 0:
            batch_count += 1
        self.logger.debug("{} total batches".format(batch_count))
        imap_results = []
        t0 = time.time()
        with mp.Pool(self.cores) as pool:
            try:
                imap_results = list(
                    tqdm(pool.imap_unordered(self._batchCheckPair, batches), total=batch_count,
                         file=sys.stdout))
            except Exception as e:
                print()
                self.logger.exception("Exception raised when putting batches into pool", exc_info=e)
                raise e
        t1 = time.time()
        self.logger.debug("{:.2f} combos/second".format(total_combinations / (t1 - t0)))
        if not imap_results:
            printLogToConsole(self.console_log_level, "imap_results is empty", logging.ERROR)
            self.logger.log(logging.ERROR, "imap_results is empty")
            return [], []
        for s, d in imap_results:
            tmp_same.extend(s)
            tmp_different.extend(d)
        # tmp_same = list(set(tmp_same))
        # tmp_different = list(set(tmp_different))
        printLogToConsole(self.console_log_level, "Combining results from pool", logging.INFO)
        self.logger.log(logging.INFO, "Combining results from pool")
        for i in tmp_same:
            if i[0] in same:
                possible_errors += 1
            same[i[0]] = i
        for i in tmp_different:
            if i[0] in different:
                possible_errors += 1
            different[i[0]] = i
        printLogToConsole(self.console_log_level, "{} overlapping keys".format(possible_errors),
                          logging.DEBUG)
        self.logger.log(logging.DEBUG, "{} overlapping keys".format(possible_errors))
    gc.collect()
    self.logger.log(logging.DEBUG,
                    "Removed {} pairs".format(total_combinations - (len(same) + len(different))))
    return [v for _, v in same.items()], [v for _, v in different.items()]