def main(queries_file, qrels_file, output_file, write_negative):
    """Generate a feature file from a qrels file.

    For every (query, document) judgment in ``qrels_file``, computes
    features and writes one formatted line to ``output_file``. When
    ``write_negative`` is truthy, each positive judgment is paired with a
    randomly drawn negative document (target label 0).

    Args:
        queries_file: Path to the topics/queries file (read via read_topics).
        qrels_file: Path to a TSV qrels file: qid <tab> _ <tab> docid <tab> target.
        output_file: Path of the feature file to write.
        write_negative: If truthy, also emit a sampled negative example per
            positive one. The evaluation set doesn't need negative examples.
    """
    queries = read_topics(queries_file)
    index_reader = IndexReader('indexes/msmarco-passage')
    document_count = int(index_reader.stats()['documents'])
    # Fix: the qrels handle was previously opened without ever being
    # closed; manage both files with a single `with` so they are released
    # even if feature computation raises.
    with open(qrels_file, 'r') as qrels, \
            open(output_file, 'w') as output_file_handle:
        for line in qrels:
            fields = line.strip().split('\t')
            qid = int(fields[0])
            docid = fields[2]
            target = fields[3]
            query = queries[qid]['title']
            features = compute_features(index_reader, query, docid)
            output_file_handle.write(
                format_qrel_line(target, qid, features, docid))
            # The evaluation set doesn't need negative examples.
            if write_negative:
                negative_docid = str(get_negative_docid(document_count, docid))
                features = compute_features(index_reader, query,
                                            negative_docid)
                output_file_handle.write(
                    format_qrel_line(0, qid, features, negative_docid))
def _compute_idf(index_path):
    """Return a {token: idf} mapping for every term in the index.

    Uses the plain idf formulation idf(t) = log(N / df(t)), where N is the
    number of documents in the index and df(t) the term's document
    frequency.
    """
    from pyserini.index import IndexReader
    reader = IndexReader(index_path)
    num_docs = reader.stats()['documents']
    # Collect vocabulary and document frequencies in one pass.
    vocab = []
    doc_freqs = []
    for term in reader.terms():
        vocab.append(term.term)
        doc_freqs.append(term.df)
    idf_values = np.log(num_docs / np.array(doc_freqs))
    return dict(zip(vocab, idf_values))
def compute_idf(query_terms: List[str], index_reader: IndexReader) -> np.ndarray:
    """BM25-style idf per query term: log((|C| - df(term) + 0.5) / (df(term) + 0.5))."""
    num_docs = index_reader.stats()['documents']
    idfs = []
    for term in query_terms:
        # get_term_counts returns (df, cf); only the document frequency is used.
        df = index_reader.get_term_counts(term, analyzer=None)[0]
        idfs.append(np.log((num_docs - df + 0.5) / (df + 0.5)))
    return np.array(idfs)
def main():
    """Smoke-test a Pyserini installation against the MS MARCO passage index.

    Runs a known query, checks index statistics, topics, and the analyzer,
    and prints "INSTALLATION OK" when everything matches the expected
    values; otherwise prints the failure reason.
    """
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher object
        searcher = SimpleSearcher(index_loc)
        # Set the active scorer to BM25
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query
        results = searcher.search('this is a test query', k=3)
        # For all results print the docid and the score
        expected = ['5578280', '2016011', '7004677']
        docids = [x.docid for x in results]
        if expected != docids:
            # Fix: '(expecteD)' typo in the diagnostic message.
            raise Exception('Test query results do not match expected:',
                            expected, '(expected)', docids, '(actual)')

        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception(
                'There are an unexpected number of terms in your index set, perhaps something went wrong while downloading and indexing the dataset?'
            )

        topics = get_topics("msmarco-passage-dev-subset")
        if topics == {}:
            raise Exception(
                'Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.'
            )

        first_query = topics[list(topics.keys())[0]]['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception(
                'Found a different first query than expected in the dataset. Did you download the right dataset?'
            )

        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"

        # Tokenizing in pyserini is called Analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception(
                'Tokenizer is not working correctly, something is probably wrong in Anserini. Perhaps try to install Anserini again.'
            )
    except Exception as inst:
        # Top-level boundary: report the failure instead of crashing.
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
# Command-line options for BM25 retrieval over the MS MARCO passage index.
# NOTE(review): `parser` is created before this chunk — not visible here.
parser.add_argument('--msmarco_dir', type=str, default="./data")
parser.add_argument('--index_dir', type=str, default="./data/index")
parser.add_argument('--output_dir', type=str, default="./data/bm25_result")
parser.add_argument('--bm25_k1', type=float, default=0.6)
parser.add_argument('--bm25_b', type=float, default=0.8)
parser.add_argument('--threads', type=int, default=4)
parser.add_argument('--sample', type=int, default=0)
args = parser.parse_args()

# Make sure the per-query result directory exists before any writes.
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# One reader (for stats) and one searcher (for retrieval) over the same index.
indexer = IndexReader(args.index_dir)
searcher = SimpleSearcher(args.index_dir)
searcher.set_bm25(k1=args.bm25_k1, b=args.bm25_b)
# k is set to the full collection size, i.e. every document is ranked.
num_candidates = indexer.stats()['documents']


def calculate_bm25(query):
    """Rank the whole collection for one (qid, text) pair and write a
    `<docid>\\t<score>` TSV file named `<qid>.tsv` in the output dir."""
    qid, text = query
    with open(os.path.join(args.output_dir, f"{qid}.tsv"), 'w') as outfile:
        candidates = searcher.search(text, k=num_candidates)
        for i in range(len(candidates)):
            outfile.write(f"{candidates[i].docid}\t{candidates[i].score}\n")


if __name__ == "__main__":
    # load the queries
    queries = dict()
    for line in open(os.path.join(args.msmarco_dir, f"queries.dev.tsv"), 'r'):
        # NOTE(review): `query` keeps its trailing newline here — presumably
        # stripped or tolerated downstream; loop body continues past this chunk.
        qid, query = line.split('\t')
'cf': 1005023 }, { 'term': 'also', 'cf': 991428 }, { 'term': 'mai', 'cf': 955836 }, { 'term': 'most', 'cf': 927327 }, { 'term': 'about', 'cf': 909980 }] total_words = index_reader.stats()['total_terms'] def dirich(freq_term_in_doc, total_words_in_doc, freq_term_in_collection, total_words, mu=1000, log=True): output = 0 if log: output = math.log( (freq_term_in_doc + mu * (freq_term_in_collection / total_words)) / (total_words_in_doc + mu)) else: