Example #1
import os

import numpy as np

# util and get_synonyms are assumed to be project-local helpers.
def compute_training_pairs_w_queries_variations(fold_idx, coll, n_iter_per_query, gt_file, dbn, qbn, ftt_model, iwi,
                                                wi):
    # builds (encoded query, relevant doc, non-relevant doc) training triples, augments
    # them with synonym-substituted query variations, and caches the result on disk
    model_name = 'qn_rd_nrd_pairs_w2v_gk_' + str(fold_idx) + '_' + str(coll) + '_' + str(n_iter_per_query)
    if not os.path.isfile(model_name):
        rd_by_qry = {}   # relevant doc names, keyed by query name
        nrd_by_qry = {}  # non-relevant doc names, keyed by query name
        for line in open(gt_file):
            data = line.split()
            qname = data[0].strip()
            dname = data[2].strip()
            if dname not in dbn:
                continue
            rj = int(data[3].strip())
            if qname not in rd_by_qry:
                rd_by_qry[qname] = []
                nrd_by_qry[qname] = []

            if rj > 0:
                rd_by_qry[qname].append(dname)
            else:
                nrd_by_qry[qname].append(dname)
        test_q_names = list(qbn.keys())
        np.random.shuffle(test_q_names)

        qn_rd_nrd_pairs = []
        for qn in test_q_names:
            if qn not in rd_by_qry:
                continue

            # add training examples with original query:
            encoded_q = qbn[qn]
            tmp_rdocs = np.random.choice(rd_by_qry[qn], n_iter_per_query, replace=True)
            tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
            for i in range(n_iter_per_query):
                qn_rd_nrd_pairs.append((encoded_q, dbn[tmp_rdocs[i]], dbn[tmp_nrdocs[i]]))
            print('original query: ' + ' '.join([iwi[w] for w in encoded_q]))
            # add extra training examples built from synonym-substituted query variations
            for i in range(len(encoded_q)):
                curr_q_word = iwi[encoded_q[i]]
                similar_words = get_synonyms(curr_q_word, ftt_model)
                for sw in similar_words:
                    sw = util.stem(sw)
                    if sw in wi and curr_q_word != sw:
                        print('word = ' + curr_q_word + ', substitute = ' + sw)
                        # copy the query before substituting: assigning encoded_q_variation = encoded_q
                        # would alias the list and mutate the original query and every stored variation
                        encoded_q_variation = list(encoded_q)
                        encoded_q_variation[i] = wi[sw]
                        print('alternative query: ' + ' '.join([iwi[w] for w in encoded_q_variation]))
                        tmp_rdocs = np.random.choice(rd_by_qry[qn], n_iter_per_query, replace=True)
                        tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
                        for j in range(n_iter_per_query):
                            qn_rd_nrd_pairs.append((encoded_q_variation, dbn[tmp_rdocs[j]], dbn[tmp_nrdocs[j]]))

        np.random.shuffle(qn_rd_nrd_pairs)
        util.save_model(qn_rd_nrd_pairs, model_name)
    else:
        qn_rd_nrd_pairs = util.load_model(model_name)
    return qn_rd_nrd_pairs
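
A minimal usage sketch, assuming the encoded collections were cached earlier with util.save_model; the file names, the 'robust04' collection label, and the loaded ftt_model are hypothetical stand-ins:

# hypothetical inputs: cached encodings and a fastText model loaded elsewhere
dbn = util.load_model('encoded_docs_by_name')     # doc name -> encoded doc
qbn = util.load_model('encoded_queries_by_name')  # query name -> encoded query
wi = util.load_model('word_index')                # word -> id
iwi = {v: k for k, v in wi.items()}               # id -> word
pairs = compute_training_pairs_w_queries_variations(0, 'robust04', 32, 'qrels.txt',
                                                    dbn, qbn, ftt_model, iwi, wi)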
Example #2
import os

import numpy as np
from tqdm import tqdm

# util, load_w2v_we, and load_indri_stopwords are assumed to be project-local helpers.
def encode_collection_with_stemming(text_by_name_p,
                                    word_dict_path,
                                    w2v_model_path,
                                    encoded_out_folder,
                                    wi=None,
                                    word_embeddings_matrix=None):
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            # the file name up to the first dot is used as the document name
            with open(fp, 'r') as f:
                text_by_name[filename.split('.')[0]] = f.read()

    # initialize embeddings matrix
    if word_embeddings_matrix is None:
        # read and adapt word index
        if wi is None:
            wi = {}
            wids_to_merge = {}
            for line in tqdm(open(word_dict_path)):
                data = line.split()
                word_stemmed = util.stem(data[0].strip())
                wid = int(data[1].strip())
                if word_stemmed not in wi:
                    wi[word_stemmed] = len(wi)
                    wids_to_merge[word_stemmed] = [wid]
                else:
                    wids_to_merge[word_stemmed].append(wid)
        we_size = 50
        # random init in [-0.02, 0.02]; the extra last row is an all-zero padding vector
        word_embeddings_matrix = np.float32(
            np.random.uniform(-0.02, 0.02, [len(wi) + 1, we_size]))
        word_embeddings_matrix[-1] = np.zeros(we_size)
        w2v_model = load_w2v_we(w2v_model_path)
        for k, v in wi.items():
            we = np.zeros(we_size)
            summed_something = False
            for wid in wids_to_merge[k]:
                if wid in w2v_model:
                    we = np.sum((we, w2v_model[wid]), axis=0)
                    summed_something = True
            if summed_something:
                we = we / np.linalg.norm(we)  # normalize new word embedding
                word_embeddings_matrix[v] = we

    encoded_docs_by_name = {}
    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        td = util.tokenize(dc, stemming=True, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, word_embeddings_matrix
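
A hedged usage sketch; the paths are illustrative, and load_w2v_we is assumed to return a {word id: vector} mapping, as the merging loop above implies:

# illustrative paths; words sharing a stem get their w2v vectors summed and normalized
docs_by_name, wi, we_matrix = encode_collection_with_stemming(
    'docs/', 'word_dict.txt', 'w2v_model.bin', 'encoded_docs/')
print(len(docs_by_name), 'documents encoded;', we_matrix.shape[0] - 1, 'words plus one padding row')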
Example #3
from tqdm import tqdm

# util is assumed to be a project-local helper module.
def compute_docs_to_rerank_by_query(queries_names, qbn, dbn, iwi, ii, fasttext_vec_model):
    docs_to_rerank_by_qry = {}
    for qn in tqdm(queries_names):
        q = qbn[qn]
        for qw in q:
            query_word = iwi[qw]
            if query_word not in fasttext_vec_model.wv.vocab:
                continue
            # find the terms most similar to the stemmed query word
            similar_words = [w[0] for w in fasttext_vec_model.wv.most_similar(positive=[query_word], topn=10)]
            for w in similar_words:
                # stem the similar words so they match the inverted index vocabulary
                w = util.stem(w)
                if w in ii:
                    if qn not in docs_to_rerank_by_qry:
                        docs_to_rerank_by_qry[qn] = []
                    # each posting entry's first element is the document identifier
                    docs_to_rerank_by_qry[qn].extend([pl[0] for pl in ii[w]])

    return docs_to_rerank_by_qry
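
Because every matching posting list is appended wholesale, the candidate lists can contain duplicates; a hedged sketch of calling the function and de-duplicating (all argument values are illustrative):

# qbn, dbn, iwi, ii come from the encoding step; ft_model is a gensim fastText model
docs_by_qry = compute_docs_to_rerank_by_query(list(qbn.keys()), qbn, dbn, iwi, ii, ft_model)
docs_by_qry = {qn: list(set(docs)) for qn, docs in docs_by_qry.items()}  # drop duplicate candidates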
Example #4
import argparse
import os
from util import os_command, stem

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='Input video.')
    parser.add_argument('save_dir', help='Save directory.')
    parser.add_argument(
        '--st_pos',
        default='00:00:10',
        help='Starting position in format hh:mm:ss[.xxx]. [00:00:10]')
    parser.add_argument('--fps', default='5', help='Frames per second. [5]')
    parser.add_argument('--max_frames',
                        default=200,
                        type=int,
                        help='Maximum number of frames. [200]')
    args = parser.parse_args()

    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)

    vid_stem = stem(args.input)
    command = [
        'ffmpeg', '-i', args.input, '-r', args.fps, '-ss', args.st_pos, '-f',
        'image2', '-vf', 'scale=-1:500', '-q:v', '2'
    ]
    if args.max_frames > 0:
        # convert to str so the argument list is uniformly strings for the subprocess
        command += ['-vframes', str(args.max_frames)]
    command += [os.path.join(args.save_dir, vid_stem + '_%04d.JPEG')]
    os_command(command)
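
Note that stem here is a path helper rather than a word stemmer: the script expects it to return the input file name without its extension. An illustrative invocation and the ffmpeg call it would build (script and file names are made up):

# python extract_frames.py input.mp4 frames/ --fps 5 --max_frames 200
# runs roughly:
# ffmpeg -i input.mp4 -r 5 -ss 00:00:10 -f image2 -vf scale=-1:500 -q:v 2 -vframes 200 frames/input_%04d.JPEG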
Example #5
import argparse
import codecs
import re
# Python 2 script; get_stopwords, remove_punct, and stem are project-local helpers.

if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument('--en_file', dest='en_file', action='store', type=str, help='tokenized English file')
   parser.add_argument('--ja_file', dest='ja_file', action='store', type=str, help='tokenized Japanese file')
   parser.add_argument('--output', dest='output', action='store', type=str, help='output dict file')
   parser.add_argument('--stem', dest='stem', action='store_true', help='stem English words')
   parser.add_argument('--nostop', dest='nostop', action='store_true', default=False, help='remove English stopwords')
   args = parser.parse_args()

   stopwords = []
   if args.nostop:
      stopwords = get_stopwords()
   nerr = 0
   with open(args.en_file, 'r') as fin_en, codecs.open(args.ja_file, 'r', encoding='utf-8') as fin_ja, codecs.open(args.output, 'w', encoding='utf-8') as fout:
      for en, ja in zip(fin_en, fin_ja):
         en = remove_punct(en.strip()).strip().split()
         if len(en) == 1:
            if args.stem:
               en = ' '.join(stem(en))
            else:
               en = ' '.join(en)
            if en in stopwords:
               continue
            ja = remove_punct(ja.strip()).strip().split()
            ja = ' '.join(ja)
            # without re.UNICODE, \w matches only ASCII word characters, so this
            # skips pairs whose Japanese side is actually romanized/ASCII text
            if re.match(r'\w+', ja):
               continue
            try:
               fout.write('%s @ %s\n' % (ja, en))
               #fout.write('%s <> %s\n' % (en, ja))
            except UnicodeDecodeError:
               nerr += 1
               continue
   print 'errors:', nerr
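
An illustrative run (Python 2; script and file names are made up). Each line written to the output pairs a Japanese token with a single-token English gloss:

# python build_dict.py --en_file tokens.en --ja_file tokens.ja --output dict.txt --stem --nostop
# output lines have the form:
# <japanese> @ <english>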