def bats_names_pairs(dir="BATS_3.0"):
    """Collect BATS category names and their analogy word pairs.

    Walks every subdirectory of *dir* (skipping the ``metadata.json``
    entry) and, for each category file found, records the file's name
    (minus its 4-character extension) and the set of (a, b) word pairs
    it contains.  When the second column lists alternatives separated
    by ``/``, only the first alternative is kept; pairs whose two words
    are identical are dropped.

    Parameters
    ----------
    dir : str, optional
        Root directory of the BATS 3.0 dataset.

    Returns
    -------
    (list of str, list of set of (str, str))
        Category names and the matching pair sets, in the same order.
    """
    names, pairs_sets = [], []
    for subdir in os.listdir(dir):
        if subdir == 'metadata.json':
            continue
        for fname in os.listdir(os.path.join(dir, str(subdir))):
            names.append(str(fname)[:-4])
            current_pairs = set()
            pairs_sets.append(current_pairs)
            with utils.open_file(os.path.join(dir, str(subdir), str(fname))) as fin:
                for line in fin:
                    left, right = [word.lower() for word in utils.to_unicode(line).split()]
                    first_alt = right.split('/')[0]
                    # Keep only the first analogy alternative; skip identity pairs.
                    if first_alt != left:
                        current_pairs.add((left, first_alt))
    return (names, pairs_sets)
def __init__(self, input, transposed=True):
    """Parse the Matrix Market header and record the corpus dimensions.

    Parameters
    ----------
    input : {str, file-like object}
        Path to input file in MM format or a file-like object that supports
        `seek()` (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`).
    transposed : bool, optional
        if True, expects lines to represent doc_id, term_id, value.
        Else, expects term_id, doc_id, value.

    Raises
    ------
    ValueError
        If the first line is present but is not a
        ``%%MatrixMarket matrix coordinate real general`` banner.
    """
    logger.info("initializing corpus reader from %s", input)
    self.input, self.transposed = input, transposed
    with utils.open_file(self.input) as lines:
        try:
            # The banner line must announce a real, general coordinate matrix.
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % (self.input, header)
                )
        except StopIteration:
            # Empty file: no header to validate; stats stay zeroed below.
            pass
        self.num_docs = self.num_terms = self.num_nnz = 0
        # Skip '%' comment lines; the first data line holds the dimensions.
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                if not self.transposed:
                    # File stores term_id, doc_id: swap so the attributes
                    # keep their documented (docs, terms) meaning.
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break
    logger.info(
        "accepted corpus with %i documents, %i features, %i non-zero entries",
        self.num_docs, self.num_terms, self.num_nnz
    )
def test_open_file_existent_file_object(self):
    """open_file must accept an already-open file object and iterate all its lines."""
    number_of_lines_in_file = 30
    # Fix: the original `open(...)` handle was never closed (resource leak);
    # a context manager guarantees closure even when the assertion fails.
    with open(datapath('testcorpus.mm')) as file_obj:
        with utils.open_file(file_obj) as infile:
            self.assertEqual(sum(1 for _ in infile), number_of_lines_in_file)
def test_open_file_non_existent_file(self):
    """utils.open_file on a missing path must raise an exception."""
    with self.assertRaises(Exception), utils.open_file('non_existent_file.txt'):
        pass
def test_open_file_non_existent_file_object(self):
    """Passing None where a file object is expected must raise."""
    file_obj = None
    with self.assertRaises(Exception), utils.open_file(file_obj):
        pass
def test_open_file_non_existent_file(self):
    """utils.open_file on a path that does not exist should raise."""
    # NOTE(review): same method name as an earlier test in this file — if
    # both live in the same TestCase the first definition is shadowed;
    # confirm they belong to distinct test classes.
    missing_path = 'non_existent_file.txt'
    with self.assertRaises(Exception):
        with utils.open_file(missing_path):
            pass
def test_open_file_existent_file(self):
    """utils.open_file on a valid path yields every line of the file."""
    expected_line_count = 30
    with utils.open_file(datapath('testcorpus.mm')) as infile:
        actual_line_count = sum(1 for _ in infile)
    self.assertEqual(actual_line_count, expected_line_count)
def test_open_file_non_existent_file_object(self):
    """A None file object must make utils.open_file raise."""
    # NOTE(review): duplicate of an earlier test name in this file — verify
    # the two definitions are in different TestCase classes.
    with self.assertRaises(Exception):
        with utils.open_file(None):
            pass
def evaluate_word_analogies_bats(model, directory, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
    """Evaluate *model* on one BATS 3.0 category directory.

    Builds every ordered quadruplet (a, b, c, d) with a != c from the word
    pairs of each file under ``BATS_3.0/<directory>`` and solves the analogy
    "a : b :: c : ?" with the 3CosAdd (vector offset) method.  Two scores are
    kept per category:

    * filtered -- the top prediction that is in-vocabulary and is not one of
      the three input words a, b, c;
    * "vanilla" -- the top in-vocabulary prediction, inputs allowed.

    Parameters
    ----------
    model : word-vector model
        Must expose ``vocab`` and ``index2word`` (gensim KeyedVectors style).
    directory : str
        Name of a BATS category sub-directory.  NOTE(review): the
        ``'BATS_3.0/'`` root is hardcoded here instead of reusing
        ``bats_names_pairs``; consider unifying.
    restrict_vocab : int, optional
        Only the first `restrict_vocab` words of the model are considered.
    case_insensitive : bool, optional
        If True, all comparisons are done on upper-cased words.
    dummy4unknown : bool, optional
        Only controls a log message here: the dummy-scoring branch is
        commented out, so OOV quadruplets are always skipped.

    Returns
    -------
    list
        ``[total_score, total_score_vanilla, scores_bats, scores_bats_vanilla]``
        where each score row is ``[name, accuracy, n_correct, n_total]``.
        NOTE(review): ``total_score`` / ``total_score_vanilla`` are bound only
        when at least one quadruplet was evaluated; with no usable data this
        raises ``UnboundLocalError``.  Likewise ``oov_ratio`` divides by
        ``quadruplets_no`` which is zero in that case.
    """
    logger = logging.getLogger(__name__)
    print("# Computing analogy scores for category type: ", str(directory))
    # Restrict the evaluation vocabulary to the most frequent words.
    ok_vocab = [(w, model.vocab[w]) for w in model.index2word[:restrict_vocab]]
    # reversed() so that, on case collisions, the more frequent word wins.
    ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab) } if case_insensitive else dict(ok_vocab)
    oov = 0
    # logger.info("Evaluating word analogies for top %i words in the model on %s", restrict_vocab, analogies)
    sections, section = [], None
    quadruplets_no = 0
    directions_names_bats = []
    pairs_sets = []
    scores_bats = []  # per-category [name, score, correct, total] rows
    scores_bats_vanilla = []  # same, for the vanilla (unfiltered) predictions
    # Read every category file: one set of upper-cased (A, B) pairs per file,
    # keeping only the first alternative of "b1/b2/..." targets and skipping
    # identity pairs.
    for f in os.listdir('BATS_3.0/' + str(directory)):
        directions_names_bats.append(str(f)[:-4])
        pairs_sets.append(set())
        with utils.open_file('BATS_3.0/' + str(directory) + '/' + str(f)) as fin:
            for line_no, line in enumerate(fin):
                line = utils.to_unicode(line)
                a, b = [word.lower() for word in line.split()]
                list_b = b.split('/')
                if list_b[0] != a:
                    pairs_sets[-1].add((a.upper(), list_b[0].upper()))
    for i in range(len(directions_names_bats)):
        # `section` holds the PREVIOUS category's results; score and store it
        # before starting a new one (the last category is flushed after the loop).
        if section:  # store the last section, too
            sections.append(section)
            # model._log_evaluate_word_analogies(section)
            correct, incorrect = len(section['correct']), len(section['incorrect'])
            if correct + incorrect > 0:
                score = correct / (correct + incorrect)
                logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
                scores_bats.append([section['section'], score, correct, correct + incorrect])
            else:
                print('No score for ', section['section'])
            correct, incorrect = len(section['correct_vanilla']), len(section['incorrect_vanilla'])
            if correct + incorrect > 0:
                score = correct / (correct + incorrect)
                logger.info("%s: %.1f%% (%i/%i) VANILLA", section['section'], 100.0 * score, correct, correct + incorrect)
                scores_bats_vanilla.append([section['section'], score, correct, correct + incorrect])
            total_section = len(section['correct_vanilla']) + len(section['incorrect_vanilla'])
            if total_section > 0:
                logger.info('Number of predictions equal to a: %i (%d), a*: %i (%d), b: %i (%d)', section['n_a'], section['n_a'] / total_section, section['n_a*'], section['n_a*'] / total_section, section['n_b'], section['n_b'] / total_section)
        # Fresh accumulator for this category.  NOTE(review): the 'cd',
        # 'badc', 'bac', 'n/cba', 'n/c', 'n/d' keys are never written in
        # this function -- presumably reserved for other analysis; confirm.
        section = {
            'section': directions_names_bats[i],
            'correct': [],
            'incorrect': [],
            'correct_vanilla': [],
            'incorrect_vanilla': [],
            'n_a': 0,
            'n_a*': 0,
            'n_b': 0,
            'cd': [],
            'badc': [],
            'bac': [],
            'n/cba': [],
            'n/c': [],
            'n/d': []
        }
        tuples = pairs_sets[i]
        # All ordered pair combinations within the category form the quadruplets.
        for t1 in tuples:
            for t2 in tuples:
                a, b = t1
                c, expected = t2
                if a != c:
                    quadruplets_no += 1
                    if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                        oov += 1
                        # if dummy4unknown:
                        #     logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip())
                        #     section['incorrect'].append((a, b, c, expected))
                        # else:
                        #     logger.debug("Skipping line with OOV words")
                        continue
                    # Temporarily swap in the restricted vocab so most_similar
                    # only ranks the allowed words; restored right after.
                    original_vocab = model.vocab
                    model.vocab = ok_vocab
                    predicted = None
                    ignore = {a, b, c}  # input words to be ignored
                    ignore_bool = False
                    positive = [b, c]
                    negative = [a]
                    # find the most likely prediction using 3CosAdd (vector offset) method
                    # TODO: implement 3CosMul and set-based methods for solving analogies
                    sims = most_similar(model, positive=positive, negative=negative, topn=5, restrict_vocab=restrict_vocab, ignore=ignore_bool)
                    model.vocab = original_vocab
                    # predicted = sims[0][0].upper() if case_insensitive else sims[0][0]
                    # Filtered prediction: first candidate that is in-vocab
                    # and not one of the input words.
                    for element in sims:
                        predicted = element[0].upper() if case_insensitive else element[0]
                        if predicted in ok_vocab and predicted not in ignore:
                            break
                    # Vanilla prediction: first in-vocab candidate, inputs allowed.
                    # NOTE(review): if `sims` is empty, `predicted_ignore` is
                    # never bound and the comparison below raises NameError.
                    for element in sims:
                        predicted_ignore = element[0].upper() if case_insensitive else element[0]
                        if predicted_ignore in ok_vocab:
                            break
                    if predicted == expected:
                        section['correct'].append((a, b, c, expected))
                    else:
                        section['incorrect'].append((a, b, c, expected))
                    if predicted_ignore == expected:
                        section['correct_vanilla'].append((a, b, c, expected))
                    else:
                        section['incorrect_vanilla'].append((a, b, c, expected))
                    # Count degenerate vanilla predictions that just echo an input.
                    if predicted_ignore == a:
                        section['n_a'] += 1
                    if predicted_ignore == b:
                        section['n_a*'] += 1
                    if predicted_ignore == c:
                        section['n_b'] += 1
    # Flush the final category (duplicates the in-loop scoring above).
    if section:  # store the last section, too
        sections.append(section)
        # model._log_evaluate_word_analogies(section)
        correct, incorrect = len(section['correct']), len(section['incorrect'])
        if correct + incorrect > 0:
            score = correct / (correct + incorrect)
            logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
            scores_bats.append([section['section'], score, correct, correct + incorrect])
        else:
            print('No score for ', section['section'])
        correct, incorrect = len(section['correct_vanilla']), len(section['incorrect_vanilla'])
        if correct + incorrect > 0:
            score = correct / (correct + incorrect)
            logger.info("%s: %.1f%% (%i/%i) VANILLA", section['section'], 100.0 * score, correct, correct + incorrect)
            scores_bats_vanilla.append([section['section'], score, correct, correct + incorrect])
        total_section = len(section['correct_vanilla']) + len(section['incorrect_vanilla'])
        if total_section > 0:
            logger.info('Number of predictions equal to a: %i (%d), a*: %i (%d), b: %i (%d)', section['n_a'], section['n_a'] / total_section, section['n_a*'], section['n_a*'] / total_section, section['n_b'], section['n_b'] / total_section)
    # Aggregate every category into one overall section.
    total = {
        'section': 'Total accuracy',
        'correct': list(chain.from_iterable(s['correct'] for s in sections)),
        'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)),
        'correct_vanilla': list(chain.from_iterable(s['correct_vanilla'] for s in sections)),
        'incorrect_vanilla': list(chain.from_iterable(s['incorrect_vanilla'] for s in sections)),
    }
    oov_ratio = float(oov) / quadruplets_no * 100
    logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
    if not dummy4unknown:
        logger.info(
            'NB: analogies containing OOV words were skipped from evaluation! '
            'To change this behavior, use "dummy4unknown=True"')
    # analogies_score = model._log_evaluate_word_analogies(total)
    correct, incorrect = len(total['correct']), len(total['incorrect'])
    # print(total)
    if correct + incorrect > 0:
        score = correct / (correct + incorrect)
        logger.info("%s: %.1f%% (%i/%i)", total['section'], 100.0 * score, correct, correct + incorrect)
        total_score = ["# Total " + str(directory), score, correct, correct + incorrect]
        analogies_score = score
    correct_vanilla, incorrect_vanilla = len(total['correct_vanilla']), len(total['incorrect_vanilla'])
    # print(total)
    if correct_vanilla + incorrect_vanilla > 0:
        score = correct_vanilla / (correct_vanilla + incorrect_vanilla)
        logger.info("%s: %.1f%% (%i/%i) VANILLA", total['section'], 100.0 * score, correct_vanilla, correct_vanilla + incorrect_vanilla)
        total_score_vanilla = ["# Total " + str(directory), score, correct_vanilla, correct_vanilla + incorrect_vanilla]
        analogies_score = score
    sections.append(total)
    bats_scores = [total_score, total_score_vanilla, scores_bats, scores_bats_vanilla]
    # Return the overall score and the full lists of correct and incorrect analogies
    return bats_scores  # [analogies_score, sections, bats_scores]