def bats_names_pairs(dir="BATS_3.0"):
    """Collect BATS category names and their word-pair sets.

    Walks every sub-directory of `dir` (skipping the ``metadata.json``
    entry) and, for each category file found there, records the file name
    without its 4-character extension plus the set of lowercase ``(a, b)``
    pairs read from the file, keeping only the first ``/``-separated
    alternative of each ``b`` and dropping identity pairs.

    Returns
    -------
    tuple of (list of str, list of set)
        Category names and, in matching order, their pair sets.
    """
    names, pairs_sets = [], []
    for entry in os.listdir(dir):
        if entry == 'metadata.json':
            continue
        category_dir = os.path.join(dir, str(entry))
        for fname in os.listdir(category_dir):
            names.append(str(fname)[:-4])
            pairs = set()
            with utils.open_file(os.path.join(category_dir, str(fname))) as fin:
                for line in fin:
                    line = utils.to_unicode(line)
                    a, b = [word.lower() for word in line.split()]
                    first_b = b.split('/')[0]
                    # Keep only the first analogy alternative for b.
                    if first_b != a:
                        pairs.add((a, first_b))
            pairs_sets.append(pairs)
    return (names, pairs_sets)
Пример #2
0
        def __init__(self, input, transposed=True):
            """Set up the corpus reader and parse the Matrix Market header.

            Parameters
            ----------
            input : {str, file-like object}
                Path to a file in MM format, or a file-like object supporting
                `seek()` (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`).
            transposed : bool, optional
                If True, each data line is doc_id, term_id, value; otherwise
                term_id, doc_id, value.

            """
            logger.info("initializing corpus reader from %s", input)
            self.input = input
            self.transposed = transposed
            with utils.open_file(self.input) as stream:
                try:
                    header = utils.to_unicode(next(stream)).strip()
                except StopIteration:
                    pass  # empty file: nothing to validate
                else:
                    # Only the "coordinate real general" MM flavour is supported.
                    if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                        raise ValueError(
                            "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                            (self.input, header)
                        )

                self.num_docs = self.num_terms = self.num_nnz = 0
                for lineno, line in enumerate(stream):
                    line = utils.to_unicode(line)
                    if line.startswith('%'):
                        continue  # skip MM comment lines
                    # The first non-comment line carries the matrix dimensions.
                    self.num_docs, self.num_terms, self.num_nnz = (
                        int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

            logger.info(
                "accepted corpus with %i documents, %i features, %i non-zero entries",
                self.num_docs, self.num_terms, self.num_nnz
            )
Пример #3
0
 def test_open_file_existent_file_object(self):
     """open_file must pass through an already-open file object intact."""
     expected_line_count = 30
     with utils.open_file(open(datapath('testcorpus.mm'))) as infile:
         line_count = sum(1 for _ in infile)
     self.assertEqual(line_count, expected_line_count)
Пример #4
0
 def test_open_file_non_existent_file(self):
     """open_file must raise when given a path that does not exist."""
     missing_path = 'non_existent_file.txt'
     with self.assertRaises(Exception):
         with utils.open_file(missing_path):
             pass
Пример #5
0
 def test_open_file_non_existent_file_object(self):
     """open_file must raise when given None instead of a file object."""
     with self.assertRaises(Exception):
         with utils.open_file(None):
             pass
Пример #6
0
 def test_open_file_non_existent_file(self):
     """A missing path handed to open_file must raise an exception."""
     bad_path = 'non_existent_file.txt'
     with self.assertRaises(Exception):
         with utils.open_file(bad_path):
             pass
Пример #7
0
 def test_open_file_existent_file(self):
     """open_file must yield every line of an existing corpus file."""
     expected_line_count = 30
     count = 0
     with utils.open_file(datapath('testcorpus.mm')) as infile:
         for _ in infile:
             count += 1
     self.assertEqual(count, expected_line_count)
Пример #8
0
 def test_open_file_non_existent_file_object(self):
     """Passing a null file object to open_file must raise."""
     bad_obj = None
     with self.assertRaises(Exception):
         with utils.open_file(bad_obj):
             pass
Пример #9
0
def _log_bats_section(section, logger, scores_bats, scores_bats_vanilla):
    """Log and record the accuracy of one finished BATS section.

    Appends ``[name, score, correct, total]`` rows to `scores_bats` and
    `scores_bats_vanilla` (both mutated in place) for each variant that
    evaluated at least one quadruplet, and logs how often the vanilla
    prediction degenerated to one of the input words.
    """
    correct, incorrect = len(section['correct']), len(section['incorrect'])
    if correct + incorrect > 0:
        score = correct / (correct + incorrect)
        logger.info("%s: %.1f%% (%i/%i)", section['section'],
                    100.0 * score, correct, correct + incorrect)
        scores_bats.append(
            [section['section'], score, correct, correct + incorrect])
    else:
        print('No score for ', section['section'])

    correct, incorrect = len(section['correct_vanilla']), len(
        section['incorrect_vanilla'])
    if correct + incorrect > 0:
        score = correct / (correct + incorrect)
        logger.info("%s: %.1f%% (%i/%i) VANILLA", section['section'],
                    100.0 * score, correct, correct + incorrect)
        scores_bats_vanilla.append(
            [section['section'], score, correct, correct + incorrect])

    # correct/incorrect still hold the vanilla counts here.
    total_section = correct + incorrect
    if total_section > 0:
        # BUGFIX: the ratios are fractions in [0, 1]; '%d' truncated them all
        # to 0 in the original, so format them with '%.2f' instead.
        logger.info(
            'Number of predictions equal to a: %i (%.2f), a*: %i (%.2f), b: %i (%.2f)',
            section['n_a'], section['n_a'] / total_section,
            section['n_a*'], section['n_a*'] / total_section,
            section['n_b'], section['n_b'] / total_section)


def evaluate_word_analogies_bats(model,
                                 directory,
                                 restrict_vocab=300000,
                                 case_insensitive=True,
                                 dummy4unknown=False):
    """Evaluate `model` on the BATS analogies stored under ``BATS_3.0/<directory>``.

    For every category file, all ordered combinations of two distinct word
    pairs (a:b :: c:expected, a != c) are solved with the 3CosAdd vector
    offset method, scored twice: once excluding the input words from the
    candidates ('correct'/'incorrect') and once keeping them ('*_vanilla').

    Parameters
    ----------
    model : word-vector model
        Must expose ``vocab`` and ``index2word`` (old gensim KeyedVectors API).
    directory : str
        Sub-directory of ``BATS_3.0`` holding the category files.
    restrict_vocab : int, optional
        Only the first `restrict_vocab` model words are considered.
    case_insensitive : bool, optional
        Uppercase all words before vocabulary lookup and comparison.
    dummy4unknown : bool, optional
        NOTE(review): currently a no-op — quadruplets with OOV words are
        always skipped regardless of this flag; confirm whether they should
        instead be counted as errors when True.

    Returns
    -------
    list
        ``[total_score, total_score_vanilla, scores_bats, scores_bats_vanilla]``,
        each score entry being ``[name, accuracy, correct, total]``. The two
        totals are ``None`` when that variant evaluated no quadruplets
        (the original raised NameError in that case).
    """
    logger = logging.getLogger(__name__)
    print("# Computing analogy scores for category type: ", str(directory))

    ok_vocab = [(w, model.vocab[w]) for w in model.index2word[:restrict_vocab]]
    ok_vocab = {w.upper(): v
                for w, v in reversed(ok_vocab)
                } if case_insensitive else dict(ok_vocab)
    oov = 0
    quadruplets_no = 0
    sections = []
    directions_names_bats = []
    pairs_sets = []
    scores_bats = []
    scores_bats_vanilla = []

    # Load the category files: one name and one set of uppercase (A, B)
    # pairs each, keeping only the first '/'-alternative of every b entry.
    base_dir = os.path.join('BATS_3.0', str(directory))
    for f in os.listdir(base_dir):
        directions_names_bats.append(str(f)[:-4])
        pairs_sets.append(set())
        with utils.open_file(os.path.join(base_dir, str(f))) as fin:
            for line in fin:
                line = utils.to_unicode(line)
                a, b = [word.lower() for word in line.split()]
                list_b = b.split('/')
                if list_b[0] != a:  # keep only the first analogy pair
                    pairs_sets[-1].add((a.upper(), list_b[0].upper()))

    for name, tuples in zip(directions_names_bats, pairs_sets):
        section = {
            'section': name,
            'correct': [],
            'incorrect': [],
            'correct_vanilla': [],
            'incorrect_vanilla': [],
            'n_a': 0,
            'n_a*': 0,
            'n_b': 0,
            'cd': [],
            'badc': [],
            'bac': [],
            'n/cba': [],
            'n/c': [],
            'n/d': []
        }

        for a, b in tuples:
            for c, expected in tuples:
                if a == c:
                    continue
                quadruplets_no += 1
                if a not in ok_vocab or b not in ok_vocab \
                        or c not in ok_vocab or expected not in ok_vocab:
                    oov += 1
                    continue  # OOV quadruplets are skipped (see dummy4unknown note)

                # Temporarily restrict the model's vocabulary; restore it even
                # if most_similar raises (BUGFIX: the original could leave the
                # model with the truncated vocabulary after an exception).
                original_vocab = model.vocab
                model.vocab = ok_vocab
                try:
                    # 3CosAdd (vector offset); ignore=False so the raw top-5
                    # candidates come back and both variants can be scored.
                    # TODO: implement 3CosMul and set-based methods.
                    sims = most_similar(model,
                                        positive=[b, c],
                                        negative=[a],
                                        topn=5,
                                        restrict_vocab=restrict_vocab,
                                        ignore=False)
                finally:
                    model.vocab = original_vocab

                ignore = {a, b, c}  # input words excluded from the strict prediction
                # Strict prediction: first in-vocabulary candidate that is not
                # an input word (falls back to the last candidate, as before).
                predicted = None
                for element in sims:
                    predicted = element[0].upper() if case_insensitive else element[0]
                    if predicted in ok_vocab and predicted not in ignore:
                        break
                # Vanilla prediction: first in-vocabulary candidate, inputs
                # allowed. BUGFIX: initialised to None so an empty `sims` no
                # longer leaves the name unbound below.
                predicted_ignore = None
                for element in sims:
                    predicted_ignore = element[0].upper() if case_insensitive else element[0]
                    if predicted_ignore in ok_vocab:
                        break

                key = 'correct' if predicted == expected else 'incorrect'
                section[key].append((a, b, c, expected))
                key = 'correct_vanilla' if predicted_ignore == expected else 'incorrect_vanilla'
                section[key].append((a, b, c, expected))

                # Track how often the vanilla prediction degenerates to an
                # input word (a, a* == b, or b == c in BATS terminology).
                if predicted_ignore == a:
                    section['n_a'] += 1
                if predicted_ignore == b:
                    section['n_a*'] += 1
                if predicted_ignore == c:
                    section['n_b'] += 1

        # One finalization path replaces the two duplicated copies that the
        # original kept in sync by hand.
        sections.append(section)
        _log_bats_section(section, logger, scores_bats, scores_bats_vanilla)

    total = {
        'section': 'Total accuracy',
        'correct': list(chain.from_iterable(s['correct'] for s in sections)),
        'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)),
        'correct_vanilla':
            list(chain.from_iterable(s['correct_vanilla'] for s in sections)),
        'incorrect_vanilla':
            list(chain.from_iterable(s['incorrect_vanilla'] for s in sections)),
    }

    if quadruplets_no > 0:  # BUGFIX: avoid ZeroDivisionError on empty categories
        oov_ratio = float(oov) / quadruplets_no * 100
        logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
    if not dummy4unknown:
        logger.info(
            'NB: analogies containing OOV words were skipped from evaluation! '
            'To change this behavior, use "dummy4unknown=True"')

    # BUGFIX: initialise both totals so the return below cannot raise
    # NameError when one variant evaluated no quadruplets at all.
    total_score = total_score_vanilla = None
    correct, incorrect = len(total['correct']), len(total['incorrect'])
    if correct + incorrect > 0:
        score = correct / (correct + incorrect)
        logger.info("%s: %.1f%% (%i/%i)", total['section'], 100.0 * score,
                    correct, correct + incorrect)
        total_score = [
            "# Total " + str(directory), score, correct, correct + incorrect
        ]
    correct, incorrect = len(total['correct_vanilla']), len(
        total['incorrect_vanilla'])
    if correct + incorrect > 0:
        score = correct / (correct + incorrect)
        logger.info("%s: %.1f%% (%i/%i) VANILLA", total['section'],
                    100.0 * score, correct, correct + incorrect)
        total_score_vanilla = [
            "# Total " + str(directory), score, correct, correct + incorrect
        ]

    sections.append(total)
    # Overall scores plus the per-section breakdowns for both variants.
    return [total_score, total_score_vanilla, scores_bats, scores_bats_vanilla]