def measure_embeddings_correlations(embeddings_file1, embeddings_file2, sample_size=10):
    """
    Estimate how well two embedding files agree by comparing, for a random sample of words,
    the similarity rankings they induce over the common vocabulary (Spearman's rank correlation).

    :param embeddings_file1:
    :param embeddings_file2:
    :param sample_size: randomly select this many words from the embeddings, and compute their rankings.
    That way we can compute the expected rank correlation coefficient.
    :return: None
    """
    warnings.filterwarnings("ignore")
    # First step: read in embeddings files
    embeddings1 = kNearestNeighbors.read_in_embeddings(embeddings_file1)
    embeddings2 = kNearestNeighbors.read_in_embeddings(embeddings_file2)
    # Second step: prune out the uncommon words from both embeddings
    forbidden = set()
    for k, v in embeddings1.items():
        if k not in embeddings2:
            forbidden.add(k)
    for f in forbidden:
        del embeddings1[f]
    forbidden = set()
    for k, v in embeddings2.items():
        if k not in embeddings1:
            forbidden.add(k)
    for f in forbidden:
        del embeddings2[f]
    # Third step: sample words from the common vocabulary
    words = embeddings1.keys()
    shuffle(words)
    words = words[0:sample_size]
    print words
    # Fourth step: build ranked lists and compute per-word correlations
    correlations = dict()
    pvalues = dict()
    for word in words:
        score_dict = kNearestNeighbors._generate_scored_dict(embeddings1, word)
        l1 = kNearestNeighbors._extract_top_k(score_dict, k=0, disable_k=True)
        score_dict = kNearestNeighbors._generate_scored_dict(embeddings2, word)
        l2 = kNearestNeighbors._extract_top_k(score_dict, k=0, disable_k=True)
        results = EmbeddingsAnalyses._compute_rank_corr_coeff(l1, l2)
        correlations[word] = results[0]
        pvalues[word] = results[1]
    print 'word: corr. coeff'
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(correlations)
    print 'word: p_value'
    pp.pprint(pvalues)
    print 'Expected Spearman\'s rank corr. coeff.: ',
    print np.mean(correlations.values())


# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.measure_embeddings_correlations(path+'unigram-embeddings-gt.json', path+'unigram-embeddings-10000docs.json')

def filter_r_lines(sample_file, embeddings_file, output_file,
                   preprocess_function=TextPreprocessors.TextPreprocessors._preprocess_tokens):
    """
    Take a sample file and write to output_file every 'r'-annotated line
    such that at least one token from the last column has an embedding.
    :param sample_file:
    :param embeddings_file:
    :param output_file:
    :param preprocess_function:
    :return:
    """
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    out = codecs.open(output_file, 'w', 'utf-8')
    with codecs.open(sample_file, 'r', 'utf-8') as f:
        for line in f:
            cols = re.split('\t', line)
            if cols[1] != 'r':
                continue
            last_field = cols[-1][0:-1]  # take the last value, then strip out the newline.
            fields = re.split(',', last_field)
            if preprocess_function:
                fields = set(preprocess_function(fields))
            if len(fields.intersection(embeddings)) > 0:
                out.write(line)
    out.close()
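
# Hypothetical usage sketch for filter_r_lines, mirroring the commented example above (paths and file names are
# placeholders, not from this codebase). The sample file is assumed to be tab-separated, with the 'r'/'nr'
# annotation in the second column and a comma-separated token list in the last column, since that is what the
# loop above reads.
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.filter_r_lines(path+'annotated-sample.tsv', path+'unigram-embeddings.json', path+'r-lines-with-embeddings.tsv')
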
def cluster_embeddings(doc_embeddings_file, cluster_file, output_file):
    """
    Uses a doc embedding file to generate cluster embeddings. The cluster_file is a jlines file where a cluster
    id refers to doc ids. We get the embedding by looking up the doc_embeddings; we ignore docs that don't have
    embeddings. We do a sum and normalize.
    :param doc_embeddings_file:
    :param cluster_file:
    :return:
    """
    doc_embeddings_dict = kNearestNeighbors.read_in_embeddings(
        doc_embeddings_file)
    cluster_dict = dict()
    with codecs.open(cluster_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            for k, v in obj.items():
                cluster_dict[k] = v

    out = codecs.open(output_file, 'w', 'utf-8')
    for k, v in cluster_dict.items():
        list_of_vecs = list()
        for doc in v:
            if doc in doc_embeddings_dict:
                list_of_vecs.append(doc_embeddings_dict[doc])
            else:
                print 'doc not in doc embedding ',
                print doc
        if not list_of_vecs:
            # Assumption: skip clusters in which no doc has an embedding rather than writing a degenerate vector.
            continue
        tmp = dict()
        tmp[k] = VectorUtils.normalize_vector(np.sum(list_of_vecs, axis=0)).tolist()
        json.dump(tmp, out)
        out.write('\n')
    out.close()
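
# Hypothetical usage sketch for cluster_embeddings (file names are placeholders). Each line of cluster_file is
# assumed to be a single-key JSON object mapping a cluster id to a list of doc ids, e.g.
# {"cluster-1": ["doc-1", "doc-2"]}, which is the shape the loop above reads.
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.cluster_embeddings(path+'doc-embeddings.jl', path+'clusters.jl', path+'cluster-embeddings.jl')
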
def sum_and_normalize(embeddings_file, tokens_file, output_file):
    """
    Doc embeddings are computed by doing summing all tokens that exist in the embeddings file, following
    which we do an l2 normalization
    :param embeddings_file:
    :param tokens_file:
    :param output_file:
    :return:
    """
    embeddings_dict = kNearestNeighbors.read_in_embeddings(embeddings_file)
    out = codecs.open(output_file, 'w', 'utf-8')
    count = 0
    with codecs.open(tokens_file, 'r', 'utf-8') as f:
        for line in f:
            count += 1
            if count % 5 == 0:
                print 'in document...',
                print count
            obj = json.loads(line)
            list_of_vectors = list()
            flag = False
            tmp = dict()
            for token in obj.values()[0]:
                if token not in embeddings_dict:
                    continue
                else:
                    list_of_vectors.append(embeddings_dict[token])
                    flag = True
            if flag:
                tmp[obj.keys()[0]] = VectorUtils.normalize_vector(
                    np.sum(list_of_vectors, axis=0)).tolist()
                json.dump(tmp, out)
                out.write('\n')
    out.close()
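
# Hypothetical usage sketch for sum_and_normalize (file names are placeholders). Each line of tokens_file is
# assumed to be a single-key JSON object mapping a doc id to its token list, e.g. {"doc-1": ["token1", "token2"]},
# since the code above reads obj.keys()[0] and obj.values()[0].
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.sum_and_normalize(path+'unigram-embeddings.json', path+'tokens.jl', path+'doc-embeddings.jl')
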
def filter_lines_with_embeddings(sample_file, embeddings_file, output_file,
                                 preprocess_function=TextPreprocessors.TextPreprocessors._preprocess_tokens):
    """
    Take a sample file and write to output_file every line such that at least one token
    from the last column has an embedding.
    :param sample_file:
    :param embeddings_file:
    :param output_file:
    :param preprocess_function:
    :return:
    """
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    out = codecs.open(output_file, 'w', 'utf-8')
    with codecs.open(sample_file, 'r', 'utf-8') as f:
        for line in f:
            cols = re.split('\t', line)
            last_field = cols[-1][0:-1]  # take the last value, then strip out the newline.
            fields = re.split(',', last_field)
            if preprocess_function:
                fields = set(preprocess_function(fields))
            if len(fields.intersection(embeddings)) > 0:
                out.write(line)
    out.close()
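
# Hypothetical usage sketch for filter_lines_with_embeddings (file names are placeholders); unlike filter_r_lines,
# it keeps any line with a covered token, regardless of its annotation.
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.filter_lines_with_embeddings(path+'annotated-sample.tsv', path+'unigram-embeddings.json', path+'filtered-sample.tsv')
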
def idf_weighted_embedding(embeddings_file, tokens_file, idf_file,
                           output_file):
    """
    Doc embeddings are computed by doing a weighted idf sum of tokens that exist in the embeddings file, following
    which we do an l2 normalization
    :param embeddings_file:
    :param tokens_file:
    :param idf_file:
    :param output_file:
    :return: None
    """
    embeddings_dict = kNearestNeighbors.read_in_embeddings(embeddings_file)
    idf_dict = TextAnalyses.TextAnalyses.read_in_and_prune_idf(
        idf_file, lower_prune_ratio=0.0, upper_prune_ratio=1.0)
    out = codecs.open(output_file, 'w', 'utf-8')

    count = 0
    with codecs.open(tokens_file, 'r', 'utf-8') as f:
        for line in f:
            count += 1

            tmp = dict()
            # weights = list()
            # vectors = list()
            total_weights = 0.0
            vector = list()
            obj = json.loads(line)
            for token in obj.values()[0]:
                if token not in embeddings_dict:
                    continue
                elif token not in idf_dict:
                    continue
                else:
                    # print idf_dict[token]
                    weight = float(idf_dict[token])
                    total_weights += weight
                    vector1 = [
                        element * weight
                        for element in list(embeddings_dict[token])
                    ]
                    if not vector:
                        vector = vector1
                    else:
                        vector = list(np.sum([vector, vector1], axis=0))
            if total_weights == 0.0:
                print 'no doc embedding. Skipping document...'
                continue
            tmp[obj.keys()[0]] = list(VectorUtils.normalize_vector(vector))
            print count
            json.dump(tmp, out)
            out.write('\n')
    out.close()
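
# Hypothetical usage sketch for idf_weighted_embedding (file names are placeholders). Per the code above, a
# document whose covered tokens are t_1..t_n gets the vector normalize(sum_i idf(t_i) * v(t_i)), so rarer
# tokens contribute more to the doc embedding than common ones.
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.idf_weighted_embedding(path+'unigram-embeddings.json', path+'tokens.jl', path+'idf.json', path+'idf-doc-embeddings.jl')
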
def filter_dict_terms_with_embeddings(dictionary_file, embeddings_file, output_file):
    """
    Write to output_file (one term per line) the dictionary terms that have an embedding.
    :param dictionary_file:
    :param embeddings_file:
    :param output_file:
    :return:
    """
    with codecs.open(dictionary_file, 'r', 'utf-8') as dict_file:
        dictionary_set = set(json.load(dict_file))
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    out = codecs.open(output_file, 'w', 'utf-8')
    for m in dictionary_set.intersection(embeddings):
        out.write(m)
        out.write('\n')
    out.close()
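
# Hypothetical usage sketch for filter_dict_terms_with_embeddings (file names are placeholders). dictionary_file
# is assumed to be a JSON array of terms, since it is passed straight to json.load and turned into a set.
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.filter_dict_terms_with_embeddings(path+'dictionary.json', path+'unigram-embeddings.json', path+'covered-terms.txt')
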
def _check_embeddings_coverage(sample_file, embeddings_file,
                               preprocess_function=TextPreprocessors.TextPreprocessors._preprocess_tokens):
    """
    Designed for any sample file. Will first read in all tokens (using space as separator) from the first
    column of sample_file, and if preprocess_function is not None, will preprocess the token list.
    Next, we'll read in the embeddings file and compute token coverage.

    Note that if multiple tokens appear in some line, it will be as if they are different lines. This is because
    each token would be a separate 'instance' in any ML algorithm we use.
    :param sample_file:
    :param embeddings_file:
    :param preprocess_function: a function
    :return: None
    """
    list_of_r_tokens = list()
    list_of_nr_tokens = list()
    with codecs.open(sample_file, 'r', 'utf-8') as f:
        for line in f:
            cols = re.split('\t', line)
            first_field = cols[0]
            fields = re.split(' ', first_field)
            if preprocess_function:
                fields = preprocess_function(fields)
            label = cols[1].strip()  # strip any trailing newline so 'r' and 'nr' compare consistently
            if label == 'r':
                list_of_r_tokens += fields
            elif label == 'nr':
                list_of_nr_tokens += fields
            else:
                print 'Error in line! Run sample validation code'
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    covered_r = 0
    covered_nr = 0
    for r in list_of_r_tokens:
        if r in embeddings:
            covered_r += 1
    for nr in list_of_nr_tokens:
        if nr in embeddings:
            covered_nr += 1
    print 'Covered r is ' + str(covered_r) + ' out of a total of ' + str(len(list_of_r_tokens)) + ' tokens'
    print 'Covered nr is ' + str(covered_nr) + ' out of a total of ' + str(len(list_of_nr_tokens)) + ' tokens'
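
# Hypothetical usage sketch for _check_embeddings_coverage (file names are placeholders). The sample file is
# assumed to be tab-separated, with space-separated tokens in the first column and an 'r'/'nr' annotation in the
# second, since that is what the loop above reads.
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses._check_embeddings_coverage(path+'annotated-sample.tsv', path+'unigram-embeddings.json')
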
def _build_vector_set_for_attribute(embeddings_file, ground_truth_file, attribute):
    """
    Collect embedding vectors for the tokens that occur as values of a given attribute in the ground-truth file.
    :param embeddings_file:
    :param ground_truth_file:
    :param attribute:
    :return: A dictionary whose keys are the (preprocessed) attribute-value tokens found in the embeddings,
    and whose values are the corresponding embedding vectors.
    """
    ground_truth_list = FieldAnalyses.read_in_ground_truth_file(ground_truth_file)
    embeddings = kNearestNeighbors.read_in_embeddings(embeddings_file)
    attribute_vectors = dict()
    for obj in ground_truth_list:
        if attribute in obj:
            tokens_list = TextPreprocessors.TextPreprocessors.tokenize_field(obj, attribute)
            processed_tokens_list = TextPreprocessors.TextPreprocessors.preprocess_tokens(tokens_list)
            for token in processed_tokens_list:
                if token in embeddings:
                    attribute_vectors[token] = embeddings[token]
                # else:
                #     print 'token in file, but not in embeddings: ',
                #     print token
    # print len(attribute_vectors)
    return attribute_vectors
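
# Hypothetical usage sketch for _build_vector_set_for_attribute (path, file names and the attribute name are
# placeholders):
# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# city_vectors = EmbeddingsAnalyses._build_vector_set_for_attribute(path+'unigram-embeddings.json', path+'ground-truth.jl', 'city')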