Example #1
import os
import pprint

# Project-local classes (Index, Okapi, DiversityClustering, DBSCANClustering,
# QueryParser) come from the surrounding project and are not shown here.

def main():
    pp = pprint.PrettyPrinter()

    data_dir = 'data/easyCLEF08'
    txt_file = os.path.join(data_dir, 'easyCLEF08_text.txt')
    query_file = os.path.join(data_dir, 'easyCLEF08_query.txt')
    relevants_file = os.path.join(data_dir, 'easyCLEF08_gt.txt')

    index = Index(directory=data_dir, txt_file=txt_file, create_index=False)

    okapi = Okapi(index, k1=1.80, b=0.65)

    diversity = DiversityClustering(DBSCANClustering, index)

    q = QueryParser(relevants_file)
    q.q.initFile(query_file)

    # while True:
    for i in range(1):
        query = q.nextQuery()
        if query is None:
            break
        if len(query.relevants) == 0:
            print('No relevant docs')
            continue

        docs_scores = okapi.getRanking(query.getText())
        ordered_pred = diversity.order_pred(query, docs_scores)
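The Okapi object above is a BM25 ranker; k1 tunes term-frequency saturation and b the document-length normalization. The class internals are not part of this snippet, so here is a minimal sketch of the standard BM25 term weight it presumably computes (illustrative names, not the project's API); a document's score is the sum of this weight over the query terms:

import math

def bm25_term_weight(tf, df, doc_len, avg_doc_len, n_docs, k1=1.80, b=0.65):
    """Standard BM25 weight of one query term in one document."""
    idf = math.log(1 + (n_docs - df + 0.5) / (df + 0.5))
    tf_norm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return idf * tf_norm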
Example #2
def compare_models(names, models, measures=['PrecisionRecall', 'AveragePrecision']):
    rel_filename = 'cacm/cacm.rel'
    query_filename = 'cacm/cacm.qry'
    query_parser = QueryParser(rel_filename)
    query_parser.q.initFile(query_filename)

    train_queries, _ = query_parser.split_query_dataset()

    pp = pprint.PrettyPrinter()

    models_scores = {}

    queries_results = {name: [] for name in names}

    for query in train_queries:
        query_txt = query.getText()
        for name, model in zip(names, models):
            scores = model.getRanking(query_txt)
            results = IRList(query, scores)
            queries_results[name].append(results)

    for model_name in names:
        print('==== {} ===='.format(model_name))
        model_score = EvalIRModel(measures).eval_model(queries_results[model_name])
        models_scores[model_name] = model_score

    pp.pprint(models_scores)
    return models_scores
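A hypothetical call, reusing model constructions that appear in the other examples on this page (the model list and names are assumptions for illustration):

index = Index()
models = [Okapi(index, k1=1.80, b=0.65), LanguageModel(index, lissage=.8)]
compare_models(['Okapi', 'LanguageModel'], models)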
Example #3
    def write_data(self):
        filename = 'data/train.dat'
        with open(filename, 'w') as file:
            query_parser = QueryParser('cacm/cacm.rel')
            query_parser.q.initFile('cacm/cacm.qry')
            train_queries, _ = query_parser.split_query_dataset()

            index = self.index
            featurers_count = len(self.featurers_list.featurers)
            docs_ids = list(index.docs.keys())  # random.choice needs a sequence, not a dict view
            N = len(docs_ids)

            random.seed(SEED)

            for query in train_queries:
                queryId = query.getId()
                relevants = [str(d[1]) for d in query.relevants]
                n_relevants = len(relevants)

                for i, pertinent_doc in enumerate(relevants):
                    non_pertinent_doc = str(random.choice(docs_ids))
                    while non_pertinent_doc in relevants:
                        non_pertinent_doc = str(random.choice(docs_ids))

                    pertinent_doc_features = self.featurers_list.get_features(
                        pertinent_doc, query)
                    non_pertinent_doc_features = self.featurers_list.get_features(
                        non_pertinent_doc, query)
                    line1 = '{} qid:{}'.format(1, queryId)
                    for f_idx, f in enumerate(pertinent_doc_features):
                        line1 += ' %d:%.7f' % (f_idx, f)
                    line1 += '\n'
                    file.write(line1)
                    rank = i + 2
                    line2 = '{} qid:{}'.format(rank, queryId)
                    for f_idx, f in enumerate(non_pertinent_doc_features):
                        line2 += ' %d:%.7f' % (f_idx, f)
                    line2 += '\n'
                    file.write(line2)
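write_data emits one line per document in the SVMlight/RankLib training format, `<target> qid:<query id> <feature>:<value> ...`. Note that the conventional format numbers features from 1, while the enumerate calls above start at 0; a line in the usual 1-based form would look like this (values made up for illustration):

1 qid:12 1:0.5301242 2:0.0021000 3:1.2500000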
Example #4
    def train_linear_model(self,
                           alpha=10e-1,
                           lambda_=10e-5,
                           t_max=1000,
                           decay=10e-7):

        rel_filename = 'cacm/cacm.rel'
        query_filename = 'cacm/cacm.qry'
        query_parser = QueryParser(rel_filename)
        query_parser.q.initFile(query_filename)
        train_queries, _ = query_parser.split_query_dataset()
        # integer division: itertools.repeat needs an int count under Python 3
        iterator = itertools.chain.from_iterable(
            itertools.repeat(train_queries, t_max // len(train_queries)))

        index = self.index
        featurers_count = len(self.featurers_list.featurers)
        docs_ids = list(index.docs.keys())  # random.choice needs a sequence, not a dict view
        theta = np.ones(featurers_count)
        N = len(docs_ids)

        base_model = LanguageModel(index)

        def get_f(theta, features):
            return np.dot(theta, features)

        random.seed(SEED)

        main_losses = []
        losses = []

        for i, query in enumerate(iterator):

            alpha *= (1. / (1. + decay * i))
            relevants = [d[1] for d in query.relevants]

            non_pertinent_doc = str(random.choice(docs_ids))
            while non_pertinent_doc in relevants:
                non_pertinent_doc = str(random.choice(docs_ids))

            non_pertinent_docs = [non_pertinent_doc]

            # base_model_preds = [d for d,rank in base_model.getRanking(query.getText())[:20]]
            # non_pertinent_docs = [str(d) for d in base_model_preds if int(d) not in relevants]
            # non_pertinent_docs = random.sample(non_pertinent_docs, 1)

            for non_pertinent_doc in non_pertinent_docs:
                pertinent_doc = str(random.choice(relevants))
                pertinent_doc_features = np.array(
                    self.featurers_list.get_features(pertinent_doc, query))
                non_pertinent_doc_features = np.array(
                    self.featurers_list.get_features(non_pertinent_doc, query))
                f_pertinent_doc = get_f(theta, pertinent_doc_features)
                f_non_pertinent_doc = get_f(theta, non_pertinent_doc_features)
                loss = 1 - f_pertinent_doc + f_non_pertinent_doc
                losses.append(loss > 0)  # record whether this pair violated the margin
                if loss > 0:
                    theta += alpha * (pertinent_doc_features -
                                      non_pertinent_doc_features)
                    theta *= (1 - lambda_ * np.linalg.norm(theta, 2))

            if i % 100 == 0:
                print(i)
                print('regul', (1 - lambda_ * np.linalg.norm(theta, 2)))
                print('lr', alpha)
                print(theta)
                t_loss = np.mean(losses)
                print('LOSS = ', t_loss)
                main_losses.append(t_loss)
                losses = []
        print(main_losses)
        plt.plot(list(range(len(main_losses))), main_losses)
        plt.title('Average loss')
        plt.savefig('plot/meta_model_loss_alpha_=0_5_decay_8')
        return theta
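The update above is a pairwise (margin-ranking) hinge step: whenever the relevant document does not outscore the sampled non-relevant one by a margin of 1, theta moves along the feature difference and is then shrunk for L2 regularization. Isolated as a self-contained sketch (function and argument names are illustrative):

import numpy as np

def pairwise_hinge_step(theta, x_pos, x_neg, alpha, lambda_):
    """One update of the linear ranking model: the relevant document's
    features x_pos should score at least 1 higher than x_neg's."""
    loss = max(0.0, 1.0 - theta @ x_pos + theta @ x_neg)
    if loss > 0:
        theta = theta + alpha * (x_pos - x_neg)                # sub-gradient step
        theta = theta * (1 - lambda_ * np.linalg.norm(theta))  # L2 shrinkage
    return theta, loss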
Example #5
    index = Index()

    featurers_list = FeaturersList(index)

    model = LinearMetaModel(index, featurers_list)
    theta = model.train_linear_model(alpha=.02,
                                     t_max=800,
                                     lambda_=.001,
                                     decay=10e-8)
    # theta = [2.46331609, 8.24653025, 6.11832922, 0.59504227, -2.89611445, 4.59988454, 1.91155859, 1.62453584]

    random.seed(0)

    rel_filename = 'cacm/cacm.rel'
    query_filename = 'cacm/cacm.qry'
    query_parser = QueryParser(rel_filename)
    query_parser.q.initFile(query_filename)
    _, test_queries = query_parser.split_query_dataset()

    print('EVAL')
    queries = []
    for query in test_queries:
        scores = model.getRanking(query, theta)
        queries.append(IRList(query, scores))
    scores = EvalIRModel().eval_model(queries)
    print(scores)
    plt.figure(figsize=(10, 8))

    y = scores['PrecisionRecall']['mean']
    x = list(range(len(y)))
    plt.plot(x, y)
Example #6
    #   Vectoriel(index, SimpleWeighter(index), normalized=True),
    #   Vectoriel(index, ThirdWeighter(index), normalized=True),
    #   Vectoriel(index, FourthWeighter(index), normalized=True),
    #   ]
    # models = [Okapi(index, k1=1.80, b=0.65), LanguageModel(index, lissage=.8)]

    # model = LanguageModel(index)

    # base_model = Okapi(index)

    # model1 = PageRank(index, seeds=5, k=2)

    models = [HITS(index, seeds=5, k=2), HITS(index, seeds=5, k=5)]

    for model in models:
        q = QueryParser(rel_filename)
        q.q.initFile(query_filename)
        queries = []
        while True:
            # for i in range(4):
            query = q.nextQuery()
            if query is None:
                break
            if len(query.relevants) == 0:
                print('No relevant docs')
                continue

            scores = model.getRanking(query.getText())

            queries.append(IRList(query, scores))
        print('___________')
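HITS(index, seeds=..., k=...) is evaluated here like any other ranker; its internals are not shown, and the seeds/k parameters are project-specific. For reference, the core of Kleinberg's HITS is a hub/authority power iteration over a link graph, e.g. (generic sketch, not the project's implementation):

import numpy as np

def hits_scores(adjacency, n_iter=50):
    """Kleinberg's HITS on an adjacency matrix A, A[i, j] = 1 if i links to j."""
    hubs = np.ones(adjacency.shape[0])
    for _ in range(n_iter):
        authorities = adjacency.T @ hubs   # good authorities are cited by good hubs
        authorities /= np.linalg.norm(authorities)
        hubs = adjacency @ authorities     # good hubs point to good authorities
        hubs /= np.linalg.norm(hubs)
    return hubs, authorities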
Example #7
    #diversity_models = [DiversityClustering(WithoutClustering, index, N=100),
    #                    DiversityClustering(RandomClustering, index, N=100),
    #                    DiversityClustering(KmeanClustering, index, N=100),
    #                    DiversityClustering(DBSCANClustering, index, N=100)]
    #names = ['Without', 'Random', 'Kmeans', 'DBSCAN']

    diversity_models = [DiversityClustering(KmeanClustering, index, N=100)]
    names = ['Kmeans']

    models_scores = {}

    for diversity, name in zip(diversity_models, names):
        print('Evaluation of %s clustering' % name)

        q = QueryParser(relevants_file)
        q.q.initFile(query_file)
        predicted_docs = []

        while True:
            # for i in range(3):
            query = q.nextQuery()

            if query is None:
                break
            if len(query.relevants) == 0:
                print('No relevant docs')
                continue

            docs_scores = Okapi(index, k1=1.80, b=0.65).getRanking(query.getText())
            ordered_pred = diversity.order_pred(query, docs_scores, cluster_order='relevance')
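order_pred re-ranks the Okapi output so that different clusters are represented early in the list; its implementation is not shown. One common scheme consistent with cluster_order='relevance' is a round-robin over clusters visited in order of their best-scoring member. A sketch under that assumption, taking (doc, score) pairs as getRanking appears to return (all names illustrative):

def round_robin_by_cluster(docs_scores, doc_cluster):
    """Diversify a ranked list: group docs by cluster (kept in relevance
    order), then take one document per cluster in turn."""
    from collections import defaultdict
    from itertools import chain, zip_longest
    buckets = defaultdict(list)
    for doc, score in sorted(docs_scores, key=lambda p: p[1], reverse=True):
        buckets[doc_cluster[doc]].append(doc)  # insertion order = relevance order
    columns = list(buckets.values())           # clusters ordered by best member
    return [d for d in chain.from_iterable(zip_longest(*columns)) if d is not None]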