Example #1
    def get_top_k_docs(self, query):
        query_words = p.preprocess_query(query)  # tokenize and normalize the query
        if len(query_words) > 10:  # long query: condense via RAKE keyword extraction
            r = Rake(min_length=1, max_length=4)
            r.extract_keywords_from_text(query)
            phrases = list(set(' '.join(r.get_ranked_phrases()).split()))
            query_words = p.preprocess_query(' '.join(phrases))
        top_k_docs = self.model.get_top_n(query_words, self.corpus,
                                          n=100)  # get top 100 docs

        # Pre-compile one case-insensitive pattern per query word for highlighting
        insensitive_comparers = {}
        for qw in query_words:
            insensitive_comparers[qw] = re.compile(re.escape(qw),
                                                   re.IGNORECASE)

        results = {
            'titles': [],
            'abstracts': [],
            'ids': [],
            'authors': [],
            'links': [],
            'category': []
        }
        for doc in top_k_docs:
            abstract = doc['abstract'].replace('\n', '')
            title = doc['title'].replace('\n', '')
            authors = doc['authors'].replace('\n', '')
            doc_id = doc['id']  # renamed from `id` to avoid shadowing the builtin
            category = self.cat_data.iloc[doc_id]['pred_category']
            if abstract == '' or title == '' or authors == '':
                continue
            abstract = p.remove_punctuations(abstract)
            doc_text = (title.lower() + ' ' + abstract.lower() + ' ' +
                        authors.lower())
            # Skip documents that contain none of the query words
            if not any(qw in doc_text for qw in query_words):
                continue
            # Bold-mark query words in the abstract
            for qw in query_words:
                abstract = insensitive_comparers[qw].sub(
                    '<b>' + qw + '</b>', abstract)
            results['titles'].append(title.title())
            results['authors'].append(authors)
            results['abstracts'].append(abstract)
            results['ids'].append(doc_id)
            results['links'].append(doc['link'])
            results['category'].append(category)
        return results
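
The long-query branch above recurs in several of the snippets below. A minimal standalone sketch of what it does, assuming `Rake` comes from the `rake_nltk` package (whose constructor accepts the `min_length`/`max_length` keyword arguments used here); the sample query is illustrative:

from rake_nltk import Rake

query = ('find recent papers about attention mechanisms in transformer '
         'models applied to long document summarization')
r = Rake(min_length=1, max_length=4)  # extract keyword phrases of 1-4 words
r.extract_keywords_from_text(query)
# Flatten the ranked phrases into a deduplicated bag of content words
phrases = list(set(' '.join(r.get_ranked_phrases()).split()))
print(phrases)  # RAKE drops stopwords, so only content words remain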
Example #2
    def store_relevance_judgements(self, query, doc_id, rel_score):
        """
        Args:
            query: str
            doc_id: unique ID corresponding to a document
            rel_score: '-1' (not relevant) or '1' (relevant), passed as a string
        """
        if rel_score not in ['-1', '1']:
            print('Invalid Relevance Feedback:', rel_score)
            return

        # Append the judgement as a CSV row; commas are stripped from the
        # query so it stays a single field
        with open(self.relevance_scores_file, 'a') as f:
            f.write(','.join([query.replace(',', ''),
                              str(doc_id), rel_score]) + '\n')

        query_words = preprocess_query(query)
        if len(query_words) > 10:  # long query search
            r = Rake(min_length=1, max_length=4)
            r.extract_keywords_from_text(query)
            phrases = r.get_ranked_phrases()
            query_words = ' '.join(phrases).split()

        self.relevance_scores[(tuple(query_words),
                               int(doc_id))] += int(rel_score)
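
A hedged usage sketch for the feedback path; `engine` and the arguments are hypothetical, and note that the method expects `rel_score` as the string '1' or '-1':

engine.store_relevance_judgements('bm25 ranking', doc_id='42', rel_score='1')
engine.store_relevance_judgements('bm25 ranking', doc_id='42', rel_score='-1')
# Both rows are appended to relevance_scores_file for replay at startup;
# the in-memory counter for this (query words, doc id) pair nets out to 0.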
Example #3
def get_top_k_docs(model, query, corpus, k=100):
    """
    Args:
        model: Search Engine that has `get_top_n(tokenized_query, corpus, n=k)` method
        query: string
        k: int (default: 1)
    
    Returns:
        top_k_docs: dictionary keys: titles, abstracts, ids. Each element in dict[key] is a list of k elements in descending order of relevance
    """
    tokenized_query = preprocess_query(query)
    # Unstemmed, lower-cased words are kept for the exact-match filter below
    query_words = preprocess_query(query, stemming=False, lower_case=True,
                                   lemma=False, stopword_removal=True)
    top_k_docs = model.get_top_n(tokenized_query, corpus, n=k)

    results = {'titles': [], 'abstracts': [], 'ids': []}
    for doc in top_k_docs:
        abstract = doc['abstract'].replace('\n', '')
        if abstract == '':
            abstract = doc['introduction'].replace('\n', '')  # fall back to the intro
        if abstract == '':
            continue

        doc_text = (doc['title'].replace('\n', '').lower() + ' ' +
                    doc['abstract'].replace('\n', '').lower() + ' ' +
                    doc['introduction'].replace('\n', '').lower())
        # Skip documents that contain none of the query words
        if not any(qw in doc_text for qw in query_words):
            continue

        results['abstracts'].append(abstract)
        results['ids'].append(doc['id'])
        results['titles'].append(doc['title'].replace('\n', ''))

    return results
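
Every snippet here leans on a `preprocess_query` helper that is not shown. A minimal sketch of one consistent with the flags used above, built on NLTK; the original's exact behavior is an assumption, and NLTK's `punkt`, `stopwords`, and `wordnet` data must be downloaded first:

import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

_STOPWORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


def preprocess_query(query, stemming=True, lower_case=True, lemma=False,
                     stopword_removal=True):
    # Hypothetical reconstruction: tokenize, normalize case, drop punctuation
    # and stopwords, then stem or lemmatize.
    if lower_case:
        query = query.lower()
    words = [w for w in word_tokenize(query) if w not in string.punctuation]
    if stopword_removal:
        words = [w for w in words if w not in _STOPWORDS]
    if lemma:
        words = [_LEMMATIZER.lemmatize(w) for w in words]
    elif stemming:
        words = [_STEMMER.stem(w) for w in words]
    return words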
Example #4
def find(query, result_count):
    con = db.connection()

    start_time = time.perf_counter_ns()
    query_tokens = pp.preprocess_query(query)
    results = db.get_all_multiword_postings(con, query_tokens)
    end_time = time.perf_counter_ns()
    search_time = round((end_time - start_time) / 1_000_000)  # ns -> ms

    result_count = min(len(results), result_count)
    tab = ' ' * 2
    freq = 'Frequencies'
    doc = 'Document'
    snip = 'Snippet'
    # Document column width: the longest document name, at least the header width
    longest_name = len(doc)
    for i in range(result_count):
        longest_name = max(longest_name, len(results[i].document_name))

    print(f"Results for query: \"{query}\"")
    print()
    print(f"{tab}Results found in {search_time}ms.")
    print(f"{tab}{freq} {doc}{' ' * (longest_name - len(doc))} {snip}")
    print()
    print(f"{tab}{'-' * len(freq)} {'-' * longest_name} {'-' * longest_name}")
    for i in range(result_count):
        result = results[i]
        snippets = create_snippets(result, 3)
        snippet_string = ''
        for j, snippet in enumerate(snippets):
            # Prefix an ellipsis when this snippet is truncated at the front
            # and the previous one did not already end with an ellipsis
            if snippet.front and (j == 0 or not snippets[j - 1].back):
                snippet_string += '... '
            snippet_string += snippet.text
            snippet_string += ' ... ' if snippet.back else ' '
        print(f"{tab}{result.frequency_sum}{' ' * (len(freq) - len(str(result.frequency_sum)))} {result.document_name}"
              f"{' ' * (longest_name - len(result.document_name))} {snippet_string}")
    print()
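
The printing loop assumes `create_snippets` returns objects exposing `text`, `front`, and `back`. A hypothetical shape for that result type, as a dataclass:

from dataclasses import dataclass


@dataclass
class Snippet:
    text: str    # words surrounding one query-term occurrence
    front: bool  # True if context was truncated before the match
    back: bool   # True if context was truncated after the match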
Example #5
    def __init__(self, model, corpus, relevance_scores_file):
        """
        model: Search Engine that has `get_top_n(query_words, corpus, n=k)` method
        relevance_scores_file: stores (query, doc_id, rel_score) in this file
        """
        self.model = model
        self.corpus = corpus
        self.relevance_scores_file = relevance_scores_file

        # Replay stored judgements, summing feedback per (query words, doc id) key
        self.relevance_scores = defaultdict(int)
        with open(self.relevance_scores_file, 'r') as f:
            for line in f:
                query, doc_id, rel_score = line.replace('\n', '').split(',')

                query_words = preprocess_query(query)
                if len(query_words) > 10:  # long query search
                    r = Rake(min_length=1, max_length=4)
                    r.extract_keywords_from_text(query)
                    phrases = r.get_ranked_phrases()
                    query_words = ' '.join(phrases).split()

                self.relevance_scores[(tuple(query_words),
                                       int(doc_id))] += int(rel_score)
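
The `defaultdict(int)` makes the replay order-independent: unseen (query words, doc id) keys start at 0, and repeated judgements simply sum. A tiny standalone illustration with a made-up key:

from collections import defaultdict

relevance_scores = defaultdict(int)
key = (('bm25', 'ranking'), 42)  # hypothetical (query words, doc id) key
relevance_scores[key] += 1       # user marked the doc relevant
relevance_scores[key] += -1      # later marked it not relevant
print(relevance_scores[key])     # 0 -- the feedback cancels out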
Example #6
    def get_top_k_docs(self, query, k=100):
        """
        Args:
            query: string
            k: int (default: 1)
        
        Returns:
            top_k_docs: dictionary keys: titles, abstracts, ids. Each element in dict[key] is a list of k elements in descending order of relevance
        """
        query_words = preprocess_query(query)
        if len(query_words) > 10:  # long query search
            r = Rake(min_length=1, max_length=4)
            r.extract_keywords_from_text(query)
            phrases = list(set(' '.join(r.get_ranked_phrases()).split()))
            query_words = preprocess_query(' '.join(phrases))

        top_k_docs = self.model.get_top_n(query_words, self.corpus, n=k)

        # Pre-compile one case-insensitive pattern per query word for highlighting
        insensitive_comparers = {}
        for qw in query_words:
            insensitive_comparers[qw] = re.compile(re.escape(qw),
                                                   re.IGNORECASE)

        results = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
        relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
        not_relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
        for doc in top_k_docs:
            abstract = doc['abstract'].replace('\n', '')
            if abstract == '':
                abstract = doc['introduction'].replace('\n', '')  # fall back to the intro
            if abstract == '':
                continue

            abstract = remove_punctuations(abstract)

            title = doc['title'].replace('\n', '')
            if title == '':
                continue

            doc_text = (title.lower() + ' ' + abstract.lower() + ' ' +
                        doc['introduction'].replace('\n', '').lower())
            if not any(qw in doc_text for qw in query_words):
                continue

            # Bold mark query words in abstract
            for qw in query_words:
                abstract = insensitive_comparers[qw].sub(
                    '<b>' + qw + '</b>', abstract)

            # Route each doc by the accumulated feedback for this exact query
            rel_score = self.relevance_scores[(tuple(query_words), doc['id'])]
            if rel_score > 0:
                relevant['titles'].append(title.title())
                relevant['abstracts'].append(abstract)
                relevant['ids'].append(doc['id'])
                relevant['links'].append(doc['link'])
            elif rel_score < 0:
                not_relevant['titles'].append(title.title())
                not_relevant['abstracts'].append(abstract)
                not_relevant['ids'].append(doc['id'])
                not_relevant['links'].append(doc['link'])
            else:
                results['titles'].append(title.title())
                results['abstracts'].append(abstract)
                results['ids'].append(doc['id'])
                results['links'].append(doc['link'])

        # Judged-relevant docs first, unjudged in the middle, not-relevant last
        for key in ['abstracts', 'ids', 'titles', 'links']:
            results[key] = relevant[key] + results[key] + not_relevant[key]

        return results
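
A hedged end-to-end sketch tying Examples #5 and #6 together; the class name `SearchEngine`, the model, the file name, and the query are illustrative, but the constructor arguments match the `__init__` in Example #5:

engine = SearchEngine(model=bm25_model, corpus=corpus,
                      relevance_scores_file='relevance_scores.csv')
results = engine.get_top_k_docs('neural information retrieval', k=20)
# Docs previously judged relevant for this exact query surface first,
# judged not-relevant docs sink to the bottom, the rest keep model order.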