Example #1
    def populate_frame(self, date_range, term_vector) -> pd.DataFrame:
        data_frame = pd.DataFrame(data=0,
                                  index=date_range,
                                  columns=term_vector)
        iterator = self.lucene_dictionary.getEntryIterator()

        for term in BytesRefIterator.cast_(iterator):
            term_as_string = term.utf8ToString()
            # print('term:', term_as_string)
            query = QueryParser("contents",
                                self.analyzer).parse(term_as_string)
            collector = TopScoreDocCollector.create(10000, 10000)
            hits = self.searcher.search(query, 1000)

            if hits is None:
                # print("No hit for term: ", term_as_string)
                continue

            print("Found hit: " + term_as_string)

            for hit in hits.scoreDocs:
                document = self.searcher.doc(hit.doc)

                doc_name = document.getField("doc_name")
                date = datetime.datetime.strptime(doc_name.stringValue(),
                                                  '%m%d%y')

                current_value = data_frame.at[date, term_as_string]
                if np.isnan(current_value):
                    current_value = 0
                data_frame.at[date, term_as_string] = current_value + 1

        return data_frame
Example #2
    def getMostFrequentTermNoStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        return currentTerm
Example #3
    def get_terms(self) -> List[str]:
        iterator = self.lucene_dictionary.getEntryIterator()

        map_iterator = map(lambda term: term.utf8ToString(),
                           BytesRefIterator.cast_(iterator))

        return list(map_iterator)
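Example #3 above boils the common pattern on this page down to its essence: wrap a Lucene terms enumerator with BytesRefIterator.cast_ so it can be consumed as a plain Python iterable. A minimal, self-contained sketch of that pattern follows; the index path and the "content" field are assumptions, and Terms.iterator() takes a reuse argument (usually None) on older Lucene releases but no argument on newer ones.

import lucene
from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import BytesRefIterator

lucene.initVM()
# open a hypothetical index whose "content" field stores term vectors
reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))
terms = reader.getTermVector(0, "content")  # term vector of the first document
if terms is not None:
    terms_enum = terms.iterator()  # pass None here on older Lucene versions
    for bytes_ref in BytesRefIterator.cast_(terms_enum):
        print(bytes_ref.utf8ToString(), terms_enum.totalTermFreq())
reader.close()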
Example #4
    def getTermVectors(route):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)
        ls = []
        for doc in range(ireader.numDocs()):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []

            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if (terms is not None):
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                    vector.vector.append(text)
                    vector.freqs.append(tf * idf)
                    norm += (tf * idf) * (tf * idf)
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
        return ls
Example #5
    def getMostFrequentTermStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "more", "http", "html", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "el", "la", "lo", "los",
            "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
            "los"
        ]

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT,
                                              stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString().encode('UTF-8')
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq

        return currentTerm
Example #6
def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall)
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)


    # print str(vocab_size) # size of vocabulary
    # print str(vocabulary.getDocCount()) # #docs that have at least one term for title field
    # print str(vocabulary.getSumTotalTermFreq()) # #tokens
    # print str(vocabulary.getSumDocFreq()) # #postings

    reader.close()
    return duration, vocab_size
Example #7
 def get_doc_termvector(self, lucene_doc_id, field):
     """Outputs the document term vector as a generator."""
     terms = self.reader.getTermVector(lucene_doc_id, field)
     if terms:
         termenum = terms.iterator(None)
         for bytesref in BytesRefIterator.cast_(termenum):
             yield bytesref.utf8ToString(), termenum
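A possible way to consume the generator above; the surrounding object (called indexer here), the document id, and the field name are all assumptions for illustration.

# indexer is assumed to be an instance of the class that defines get_doc_termvector,
# with an open IndexReader bound to self.reader
for term_text, terms_enum in indexer.get_doc_termvector(0, "content"):
    print(term_text, terms_enum.totalTermFreq())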
Example #8
    def getTermVectors(route):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []
        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)
        ls = []
        for doc in range(ireader.numDocs()):
            vector = FreqVector()
            vector.vector = []
            vector.freqs = []

            norm = 0.0
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                    t = Term("content", term)
                    idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                    vector.vector.append(text)
                    vector.freqs.append(tf * idf)
                    norm += (tf * idf) * (tf * idf)
                ls.append((vector, math.sqrt(norm)))
            else:
                ls.append((vector, 0))
        return ls
Example #9
 def get_doc_termvector(self, lucene_doc_id, field):
     """Outputs the document term vector as a generator."""
     terms = self.reader.getTermVector(lucene_doc_id, field)
     if terms:
         termenum = terms.iterator(None)
         for bytesref in BytesRefIterator.cast_(termenum):
             yield bytesref.utf8ToString(), termenum
Example #10
    def getMostFrequentTermNoStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString()
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq
        return currentTerm
Example #11
File: tirza.py, Project: komax/tirza
def read_vectors():
    for doc in range(0, reader.numDocs()):
        for fieldName in FIELD_NAMES:
            terms = reader.getTermVector(doc, fieldName)
            if terms:
                termsEnum = terms.iterator(None)
                vectors[fieldName][doc] = \
                    set(term.utf8ToString() for term in BytesRefIterator.cast_(termsEnum))
Example #12
 def get_coll_termvector(self, field):
     """ Returns collection term vector for the given field."""
     self.open_reader()
     fields = MultiFields.getFields(self.reader)
     if fields is not None:
         terms = fields.terms(field)
         if terms:
             termenum = terms.iterator(None)
             for bytesref in BytesRefIterator.cast_(termenum):
                 yield bytesref.utf8ToString(), termenum
Example #13
 def get_coll_termvector(self, field):
     """ Returns collection term vector for the given field."""
     self.open_reader()
     fields = MultiFields.getFields(self.reader)
     if fields is not None:
         terms = fields.terms(field)
         if terms:
             termenum = terms.iterator(None)
             for bytesref in BytesRefIterator.cast_(termenum):
                 yield bytesref.utf8ToString(), termenum
Example #14
 def get_term_freq(self, docid, field, terms=None):
     if terms is None:
         terms = self.reader.getTermVector(docid, field)
     term_freq = {}
     if terms is not None:
         te_itr = terms.iterator()
         for bytesref in BytesRefIterator.cast_(te_itr):
             t = bytesref.utf8ToString()
             freq = te_itr.totalTermFreq()
             term_freq[t] = freq
     return term_freq
Example #15
def getLabelsandTerms(ireader, numDocs):
    labels = []
    terms_list = []
    for doc in xrange(0, numDocs):
        tv = ireader.getTermVector(doc, "contents")
        document = ireader.document(doc)
        topic = document.getField("topic")
        labels.append(topic.stringValue())
        termsEnum = tv.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            curr_term = term.utf8ToString()
            terms_list.append(curr_term)
    return labels, terms_list
Example #16
 def getTFForField(self, field):
     tfs = {}
     fields = MultiFields.getFields(self.reader)
     terms = fields.terms(field)
     enum = BytesRefIterator.cast_(terms.iterator(None))
     try:
         while enum.next():
             termval = TermsEnum.cast_(enum)
             termString = termval.term().utf8ToString()
             freq = self.reader.totalTermFreq(Term(field, termString))
             tfs[termString] = freq
     except:
         pass
     return tfs
Example #17
def getTermFrequencyMatrix(ireader, num_docs, te):
    feature_mat = np.zeros([num_docs, len(te.classes_)])
    for doc in xrange(0, num_docs):
        print "Running For Document number:" + str(doc)
        tv = ireader.getTermVector(doc, "contents")
        termsEnum = tv.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            str_term = term.utf8ToString()
            dpEnum = termsEnum.postings(None)
            dpEnum.nextDoc()
            freq = dpEnum.freq()

            term_ind = te.transform([str_term])[0]
            feature_mat[doc][term_ind] = freq
    return feature_mat
Example #18
def main(storeDir):
    reader = DirectoryReader.open(storeDir)
    numDocs = reader.numDocs()
    print("n_docs:", numDocs)

    for i in range(numDocs):
        tvec = reader.getTermVector(i, 'body')
        if tvec is not None:
            termsEnum = tvec.iterator()
            vec = {}
            for term in BytesRefIterator.cast_(termsEnum):
                dpEnum = termsEnum.postings(None)
                dpEnum.nextDoc()
                vec[term.utf8ToString()] = dpEnum.freq()
            print(vec)

    reader.close()
Example #19
    def test_bug1842(self):

        reader = self.getReader()
        searcher = self.getSearcher()
        q = TermQuery(Term("id", '1'))
        topDocs = searcher.search(q, 50)

        termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
        terms = []
        freqs = []
        termsEnum = termvec.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            terms.append(term.utf8ToString())
            freqs.append(termsEnum.totalTermFreq())
        terms.sort()
        self.assert_(terms == ['blah', 'gesundheit'])
        self.assert_(freqs == [3, 1])
Example #20
    def test_bug1842(self):

        reader = self.getReader()
        searcher = self.getSearcher()
        q = TermQuery(Term("id", '1'))
        topDocs = searcher.search(q, 50)

        termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
        terms = []
        freqs = []
        termsEnum = termvec.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            terms.append(term.utf8ToString())
            freqs.append(termsEnum.totalTermFreq())
        terms.sort()
        self.assert_(terms == ['blah', 'gesundheit'])
        self.assert_(freqs == [3, 1])
Example #21
    def get_term_freq(self, docid, field, is_cached=False):
        if is_cached == True and (field, docid) in self.dict_term_freq:
            return self.dict_term_freq[(field, docid)]

        if len(self.dict_term_freq) > 2000:
            self.dict_term_freq.clear()

        terms = self.reader.getTermVector(docid, field)
        term_freq = {}
        if terms is not None:
            te_itr = terms.iterator()
            for bytesref in BytesRefIterator.cast_(te_itr):
                t = bytesref.utf8ToString()
                freq = te_itr.totalTermFreq()
                term_freq[t] = freq

        self.dict_term_freq[(field, docid)] = term_freq
        return self.dict_term_freq[(field, docid)]
Example #22
def get_terms(indexReader, field='text'):
    """
    Gets all terms in an index.

    :param indexReader: IndexReader object of your index
    :param field: document field from which terms should be counted
    :return: list of terms (strings)
    """
    terms = []
    multiterms = MultiFields.getTerms(indexReader, field)
    termit = multiterms.iterator()
    it = BytesRefIterator.cast_(
        termit)  # Inheritance apparently doesn't work in PyLucene...
    term = it.next()
    while term:
        terms.append(term.utf8ToString())
        term = it.next()
    return terms
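A hedged usage sketch for get_terms; the index location is an assumption, and the reader is opened with the same MultiFields-era API the function itself relies on.

import lucene
from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory

lucene.initVM()
reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))
vocabulary = get_terms(reader, field='text')
print(len(vocabulary), "unique terms in field 'text'")
reader.close()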
Example #23
    def getFreqVectorFromText(self, text):
        # Initialization of Java Virtual Machine with Lucene
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        indexDir = "res/index"

        stopWords = []

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)

        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(indexDir))
        conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
        rebuild = True
        if rebuild:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        else:
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        iwriter = IndexWriter(directory, conf)

        doc = Document()
        doc.add(Field("docName", 'url', Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("content", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
        iwriter.close()

        ireader = IndexReader.open(directory)

        freqVector = []
        docVector = ireader.getTermVector(0, "content")

        termsEnum = docVector.iterator(None)
        for term in BytesRefIterator.cast_(termsEnum):
            text = term.utf8ToString()
            freq = termsEnum.totalTermFreq()
            freqVector.append((text, freq))

        freqVector = sorted(freqVector, key=itemgetter(1), reverse=True)
        self.vector = list()
        self.freqs = list()
        for el in freqVector:
            self.vector.append(el[0])
            self.freqs.append(el[1])
Example #24
    def test_FieldEnumeration(self):
        self.test_indexDocument()

        store = self.openStore()
        writer = None
        try:
            analyzer = self.getAnalyzer()

            writer = self.getWriter(store, analyzer, False)
            doc = Document()
            doc.add(Field("title", "value of testing",
                          TextField.TYPE_STORED))
            doc.add(Field("docid", str(2),
                          StringField.TYPE_NOT_STORED))
            doc.add(Field("owner", "unittester",
                          StringField.TYPE_STORED))
            doc.add(Field("search_name", "wisdom",
                          StoredField.TYPE))
            doc.add(Field("meta_words", "rabbits are beautiful",
                          TextField.TYPE_NOT_STORED))

            writer.addDocument(doc)

            doc = Document()
            doc.add(Field("owner", "unittester",
                          StringField.TYPE_NOT_STORED))
            doc.add(Field("search_name", "wisdom",
                          StoredField.TYPE))
            doc.add(Field("meta_words", "rabbits are beautiful",
                          TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)
        finally:
            self.closeStore(store, writer)

        store = self.openStore()
        reader = None
        try:
            reader = DirectoryReader.open(store)
            term_enum = MultiFields.getTerms(reader, "docid").iterator()
            docids = [term.utf8ToString()
                      for term in BytesRefIterator.cast_(term_enum)]
            self.assertEqual(len(docids), 2)
        finally:
            self.closeStore(store, reader)
Example #25
    def get_doc_termfreqs_all_fields(self, lucene_doc_id):
        """
        Returns term frequency for all fields in the given document.

        :param lucene_doc_id: Lucene document ID
        :return: dictionary {field: {term: freq, ...}, ...}
        """
        doc_termfreqs = {}
        vectors = self.reader.getTermVectors(lucene_doc_id)
        if vectors:
            for field in vectors.iterator():
                doc_termfreqs[field] = {}
                terms = vectors.terms(field)
                if terms:
                    termenum = terms.iterator(None)
                    for bytesref in BytesRefIterator.cast_(termenum):
                        doc_termfreqs[field][bytesref.utf8ToString()] = int(termenum.totalTermFreq())
                    print doc_termfreqs[field]
        return doc_termfreqs
Example #26
    def test_FieldEnumeration(self):
        self.test_indexDocument()

        store = self.openStore()
        writer = None
        try:
            analyzer = self.getAnalyzer()

            writer = self.getWriter(store, analyzer, False)
            doc = Document()
            doc.add(Field("title", "value of testing", TextField.TYPE_STORED))
            doc.add(Field("docid", str(2), StringField.TYPE_NOT_STORED))
            doc.add(Field("owner", "unittester", StringField.TYPE_STORED))
            doc.add(Field("search_name", "wisdom", StoredField.TYPE))
            doc.add(
                Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))

            writer.addDocument(doc)

            doc = Document()
            doc.add(Field("owner", "unittester", StringField.TYPE_NOT_STORED))
            doc.add(Field("search_name", "wisdom", StoredField.TYPE))
            doc.add(
                Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)
        finally:
            self.closeStore(store, writer)

        store = self.openStore()
        reader = None
        try:
            reader = DirectoryReader.open(store)
            term_enum = MultiFields.getTerms(reader, "docid").iterator()
            docids = [
                term.utf8ToString()
                for term in BytesRefIterator.cast_(term_enum)
            ]
            self.assertEqual(len(docids), 2)
        finally:
            self.closeStore(store, reader)
Example #27
 def termsForField(self, field, prefix=None, limit=10, **kwargs):
     convert = lambda term: term.utf8ToString()
     terms = []
     termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
     if termsEnum is None:
         return terms
     iterator = termsEnum.iterator(None)
     if prefix:
         iterator.seekCeil(BytesRef(prefix))
         terms.append((iterator.docFreq(), convert(iterator.term())))
     bytesIterator = BytesRefIterator.cast_(iterator)
     try:
         while len(terms) < limit:
             term = convert(bytesIterator.next())
             if prefix and not term.startswith(prefix):
                 break
             terms.append((iterator.docFreq(), term))
     except StopIteration:
         pass
     return terms
Example #28
    def get_tf_idf(self, field_name: str, content_id: str):
        """
        Calculates the tf-idf for the words contained in the field of the content whose id
        is content_id

        Args:
            field_name (str): Name of the field containing the words for which calculate the tf-idf
            content_id (str): Id of the content that contains the specified field

        Returns:
             words_bag (Dict <str, float>):
             Dictionary whose keys are the words contained in the field,
             and the corresponding values are the tf-idf values.
        """
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(Paths.get(self.directory))))
        query = QueryParser("testo_libero",
                            KeywordAnalyzer()).parse("content_id:\"" +
                                                     content_id + "\"")
        score_docs = searcher.search(query, 1).scoreDocs
        document_offset = -1
        for score_doc in score_docs:
            document_offset = score_doc.doc

        reader = searcher.getIndexReader()
        words_bag = {}
        term_vector = reader.getTermVector(document_offset, field_name)
        term_enum = term_vector.iterator()
        for term in BytesRefIterator.cast_(term_enum):
            term_text = term.utf8ToString()
            postings = term_enum.postings(None)
            postings.nextDoc()
            term_frequency = 1 + math.log10(
                postings.freq())  # normalized term frequency
            inverse_document_frequency = math.log10(
                reader.maxDoc() / reader.docFreq(Term(field_name, term)))
            tf_idf = term_frequency * inverse_document_frequency
            words_bag[term_text] = tf_idf

        reader.close()
        return words_bag
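The weighting applied inside the loop above can be isolated into a small standalone helper; this is only a sketch of the same formula (log-normalized term frequency times log10 inverse document frequency), not part of the original class.

import math

def tf_idf(term_freq_in_doc, doc_freq, num_docs):
    # 1 + log10(freq): normalized term frequency, as in get_tf_idf above
    tf = 1 + math.log10(term_freq_in_doc)
    # log10(N / df): inverse document frequency over the whole index
    idf = math.log10(num_docs / doc_freq)
    return tf * idf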
Example #29
def get_document_vector(searcher, reader, document_id, id_field, text_field):
    '''
    Given a document id, fetch the tf-idf vector of the document.
    '''
    tc_dict = {}  # Counts of each term
    dc_dict = {}  # Number of docs associated with each term
    tfidf_dict = {}  # TF-IDF values of each term in the doc
    # Get the document id.
    query_parser = QueryParser(id_field, WhitespaceAnalyzer())
    score_docs = searcher.search(query_parser.parse(str(document_id)),
                                 1).scoreDocs
    if len(score_docs) > 0:
        # get the tf-idf vector.
        termVector = reader.getTermVector(score_docs[0].doc, text_field)
        termsEnumvar = termVector.iterator()
        termsref = BytesRefIterator.cast_(termsEnumvar)
        N_terms = 0
        try:
            while (termsref.next()):
                termval = TermsEnum.cast_(termsref)
                fg = termval.term().utf8ToString()  # Term in unicode
                if len(fg) > 3 and not fg.isdigit():
                    tc = termval.totalTermFreq()  # Term count in the doc

                    # Number of docs having this term in the index
                    dc = reader.docFreq(Term(text_field, termval.term()))
                    N_terms = N_terms + 1
                    tc_dict[fg] = tc
                    dc_dict[fg] = dc
        except:
            print('error in term_dict')

        # Compute TF-IDF for each term
        for term in tc_dict:
            tf = tc_dict[term] / N_terms
            idf = 1 + math.log(reader.numDocs() / (dc_dict[term] + 1))
            tfidf_dict[term] = tf * idf

    return tfidf_dict
Example #30
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end-start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()

    # sometimes .size() doesn't return the correct size, in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    reader.close()
    return duration, vocab_size
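A hypothetical driver for evaluate_index; the directory paths and analyzer are assumptions, the Indexer class used above is assumed to be importable, and on the older PyLucene builds this snippet targets the analyzer constructor may additionally require a Version argument.

import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer

lucene.initVM()
# paths and analyzer are placeholders, not part of the original example
elapsed, vocab_size = evaluate_index('data/docs', 'data/index', StandardAnalyzer())
print('indexing took %.2fs; %d unique terms in "title"' % (elapsed, vocab_size))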
Example #31
def main():
    #constants
    FIELD_CONTENTS = "vectext"
    DOC_NAME = "identifier"
    STORE_DIR = "../full_index1"

    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(STORE_DIR))

    ireader = DirectoryReader.open(store)  #, True)
    #print(ireader.readerIndex(0))

    searcher = IndexSearcher(ireader)  #self.getSearcher()

    pickle_file = glob.glob('full_word_list.pkl')
    print(pickle_file)
    date_range = (1785, 1805)

    bigrams = False
    remake_word_list = True
    if remake_word_list:  #not pickle_file:

        full_df = get_full_df()
        full_term_data = []
        for year in range(date_range[0], date_range[1]):
            docs_in_year = get_docs_in_year(full_df, year)
            #print(docs_in_year)
            year_dict = Counter({})
            terms = []
            freqs = []
            print(year)
            for cd, doc_id in enumerate(docs_in_year):
                #if not cd%100:
                #    print(cd , '--', len(docs_in_year))
                # get document (query by id)
                q = TermQuery(Term("identifier", doc_id + '_djvu.txt'))
                topDocs = searcher.search(q, 50000)

                #termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
                one_doc = topDocs.scoreDocs[0].doc
                doc_name = searcher.doc(one_doc)
                #print(doc_name, doc_id)

                if bigrams == False:
                    termvec = ireader.getTermVector(topDocs.scoreDocs[0].doc,
                                                    FIELD_CONTENTS)

                    if termvec != None:
                        #termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")

                        termsEnum = termvec.iterator()
                        for term in BytesRefIterator.cast_(termsEnum):
                            terms.append(term.utf8ToString())
                            freqs.append(termsEnum.totalTermFreq())
                else:
                    #print(doc_name, doc_id)
                    text = doc_name.get("text")
                    text = text.split()
                    text = strip_stopwords_punc(text)
                    for word1, word2 in zip(text[:-1], text[1:]):
                        if len(word1) + len(word2) > 6:
                            try:
                                year_dict[word1 + ' ' + word2] += 1
                            except:
                                year_dict[word1 + ' ' + word2] = 1
            if bigrams == False:
                for term, freq in zip(terms, freqs):
                    try:
                        year_dict[term] += freq
                    except:
                        year_dict[term] = freq
            print(len(year_dict))
            #print(year_dict)
            for term in list(year_dict):
                if year_dict[term] < 2:  #5 and term not in stopwords:
                    year_dict.pop(term)
            full_term_data.append(year_dict)
            print(len(year_dict))
            #year_dict = year_dict + doc_dict
            #print(year_dict.most_common(1000))
            print('\n\n')
        if bigrams:
            pickle.dump(full_term_data, open('full_bigram_list.pkl', 'wb'))
        else:
            pickle.dump(full_term_data, open('full_word_list.pkl', 'wb'))
    else:
        if bigrams:
            full_term_data = pickle.load(open('full_bigram_list.pkl', 'rb'))
        else:
            full_term_data = pickle.load(open('full_word_list.pkl', 'rb'))
        # get complete list of unique words
        # top_words_year = zscore_method(full_term_data, date_range)

        top_words_year = tfidf_method(full_term_data, date_range)
        print(top_words_year)
    pickle.dump(top_words_year, open('trending_ratio.pkl', 'wb'))
Example #32
ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be indexed"
]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator()

    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.postings(None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()

        print 'term:', term.utf8ToString()
        print '  freq:', freq

        for i in xrange(freq):
            print "  pos:", dpEnum.nextPosition()
            print "  off: %i-%i" % (dpEnum.startOffset(), dpEnum.endOffset())
    print
Example #33
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator()

    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.postings(None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()

        print 'term:', term.utf8ToString()
        print '  freq:', freq

        for i in xrange(freq):
            print "  pos:", dpEnum.nextPosition()
            print "  off: %i-%i" %(dpEnum.startOffset(), dpEnum.endOffset())
    print
Example #34
    def getMostFrequentTermStopwords(route, query):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        stopWords = [
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "no", "not", "more", "http", "html", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "el", "la", "lo", "los",
            "las", "ante", "con", "sin", "que", "es", "de", "en", "por", "y",
            "los"
        ]

        stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
        directory = SimpleFSDirectory(File(route))

        ireader = IndexReader.open(directory)

        currentTerm = ""
        currentTermFreq = 0
        for doc in range(ireader.numDocs()):
            terms = ireader.getTermVector(doc, "content")
            if terms is not None:
                termsEnum = terms.iterator(None)
                for term in BytesRefIterator.cast_(termsEnum):
                    text = term.utf8ToString().encode("UTF-8")
                    t = Term("content", term)
                    freq = ireader.totalTermFreq(t)
                    if freq > currentTermFreq and text not in query:
                        currentTerm = text
                        currentTermFreq = freq

        return currentTerm
Example #35
def run(reader, searcher, analyzer, searchTerm, K):
    #get user's query and initiate some dictionaries
    command = searchTerm
    #for each word, record how many matched documents contain it
    termOccurrence = {}
    #inverse document frequency for each word
    idf = {}
    #term frequency for each word in each matched document
    tf = {}
    #the product of tf and idf for each word in each matched document
    tfidf = {}
    #the number of words for each matched document
    docLength = {}
    #for each kind of word in each document,record how many times it appeared in the document
    termOccurrenceInADoc = {}
    #record every kind of word in matched documents as key, the value of this dictionary doesn't matter
    allWords = {}

    #parse the user query
    query = QueryParser("contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    #the total number of matched documents
    totalDocs = 0

    #for each matched document, calculate the normalized term frequency of each of its terms
    for scoreDoc in scoreDocs:
        totalDocs = totalDocs + 1
        #get url and term vectors of the matched document
        doc = searcher.doc(scoreDoc.doc)
        vectors = reader.getTermVector(scoreDoc.doc, "contents")
        enum = vectors.iterator()
        url = doc.get("url")
        #record the url of the matched document, create a nested dictionary for term occurrence of a word in a document
        termOccurrenceInADoc[url] = {}
        #record the url of matched document, create a nested dictionary for term frequency
        tf[url] = {}
        #for each term in the matched document, record each term's occurrence and calculate the total word count of the document
        for term in BytesRefIterator.cast_(enum):
            term2 = term.utf8ToString()
            #record the term
            allWords[term2] = 1
            #increase the termOccurence by one
            if termOccurrence.has_key(term2):
                termOccurrence[term2] = termOccurrence[term2] + 1
            else:
                termOccurrence[term2] = 1
            dpEnum = enum.postings(None)
            dpEnum.nextDoc()
            #get occurrence of the term
            freq = dpEnum.freq()
            #add the occurrence of the term into the total word count of the document
            if docLength.has_key(url):
                docLength[url] = docLength[url] + freq
            else:
                docLength[url] = freq
            #record the occurrence of this term in the corresponding document
            termOccurrenceInADoc[url][term2] = freq
        #for each term in each document, divide its occurrence by the total length of the document to get the normalized term frequency
        for key in termOccurrenceInADoc[url]:
            tf[url][key] = termOccurrenceInADoc[url][key] / float(
                docLength[url])
    #calculate idf for each term
    for key in termOccurrence:
        idf[key] = math.log(float(totalDocs) / termOccurrence[key], 10)
    #calculate the product of tf and idf for each term in each document
    for key in tf:
        tfidf[key] = {}
        for word in allWords:
            if tf[key].has_key(word):
                tfidf[key][word] = tf[key][word] * idf[word]
    k = K
    #this is used to record the attributes of all centroids
    centroids = {}
    #this is used to record all urls of each cluster
    clusters = {}
    #copy the tfidf dictionary, because I need to delete some keys to make sure I choose different initial centroids
    copy = tfidf.copy()
    #randomly choose k vectors as initial centroid
    for counter in range(1, k + 1):
        if len(copy) == 0:
            break
        key = random.choice(copy.keys())
        centroids[counter] = tfidf[key].copy()
        clusters[counter] = {}
        del copy[key]
    #this dictionary is used to record clusters during last iteration, it will be used to compare to new clusters to judge if clusters changed
    oldClusters = {}
    #I will do at most 100 iterations, if clusters no longer change before 100 iterations, break the loop
    for counter2 in range(1, 101):
        #for each vector, find its closest centroid, put it into the according cluster
        for key in tfidf:
            counter = 1
            #for each centroid, calculate the euclidean distance between it and the vector, find the centroid with the shortest euclidean distance
            for centroid in centroids:
                #for the first centroid, simply treat it as closest for now
                if counter == 1:
                    closestDistance = 0
                    for attribute in tfidf[key]:
                        closestDistance = closestDistance + (
                            tfidf[key].get(attribute, 0) -
                            centroids[centroid].get(attribute, 0))**2
                    closestDistance = math.sqrt(closestDistance)
                    closestCentroid = centroid
                #for other centroids, check if their euclidean distance is shorter than temporary closest, if yes, replace closest centroid
                else:
                    temp = 0
                    for attribute in tfidf[key]:
                        temp = temp + (
                            tfidf[key].get(attribute, 0) -
                            centroids[centroid].get(attribute, 0))**2
                    temp = math.sqrt(temp)
                    if temp < closestDistance:
                        closestDistance = temp
                        closestCentroid = centroid
                counter = counter + 1
            #put the url of document into new cluster, the value of the dictionary doesn't matter
            clusters[closestCentroid][key] = 1
        #after finishing calculating new clusters, clean old centroids
        centroids = {}
        #for each cluster, calculate the new centroid
        for cluster in clusters:
            docNumber = 0
            #initialize nested dictionary for each cluster's centroid
            centroids[cluster] = {}
            #for every vector in a cluster, add them together
            for document in clusters[cluster]:
                docNumber = docNumber + 1
                for term in allWords:
                    centroids[cluster][term] = centroids[cluster].get(
                        term, 0) + tfidf[document].get(term, 0)
            #if the cluster is not empty, divide the sum of all vectors by the number of vectors, the result is the new centroid
            if docNumber == 0:
                newCentroid = random.choice(tfidf.keys())
                centroids[cluster] = tfidf[newCentroid].copy()
            else:
                for term in allWords:
                    centroids[cluster][term] = float(
                        centroids[cluster][term]) / docNumber
        #if this is not the first iteration, compare new clusters with old clusters, if they are completely same, break the loop
        if counter2 != 1:
            completelySame = True
            for cluster in clusters:
                for document in clusters[cluster]:
                    if oldClusters[cluster].has_key(document) == False:
                        completelySame = False
                        break
            if completelySame == True:
                break
        #copy every new cluster, compare them with newer clusters of next iteration
        for cluster in clusters:
            oldClusters[cluster] = clusters[cluster].copy()
            clusters[cluster].clear()
    #after finishing clustering, print every cluster, the three terms with the highest tfidf, and every website of that cluster
    counter2 = 1
    for key in clusters:
        #print three terms with highest tfidf, treat them as labels
        if len(clusters[key]) != 0:
            print "cluster", counter2
            print "(3 words with highest tf*idf (labels): "
            for counter in range(1, 4):
                if not bool(centroids[key]):
                    break
                highestTFIDF = max(centroids[key].items(),
                                   key=lambda x: x[1])[0]
                print highestTFIDF.encode('ascii', 'ignore')
                del centroids[key][highestTFIDF]
            print ")<br>"
            counter2 = counter2 + 1
        #for each cluster, print all websites
        #If the file has an HTML TITLE list the title
        #If the file has no title but the body begins with an HTML H1, H2, or H3, list that
        #Otherwise use the first three words of the text
        for key2 in clusters[key]:
            soup = BeautifulSoup(urllib2.urlopen(key2))
            print "<a href=\"" + key2 + "\">"
            if soup.find("title"):
                print soup.find("title").string.encode('ascii',
                                                       'ignore') + "</a><br>"
            elif soup.find("h1"):
                print soup.find("h1").string.encode('ascii',
                                                    'ignore') + "</a><br>"
            elif soup.find("h2"):
                print soup.find("h2").string.encode('ascii',
                                                    'ignore') + "</a><br>"
            elif soup.find("h3"):
                print soup.find("h3").string.encode('ascii',
                                                    'ignore') + "</a><br>"
            else:
                text = soup.get_text().encode("utf-8")
                array = text.split()
                print array[0], " ", array[1], " ", array[2] + "</a><br>"
Example #36
    def search_query_with_relevance_feedback(self,
                                             query,
                                             feedback_qrels,
                                             num_returns=50,
                                             add_num=1):
        query_text = query["description"]
        print(query_text)
        query_text = " ".join(tokenizer.tokenize(query_text))
        query_text = self.remove_stopwords(query_text.lower())
        print(query_text)
        query_number = query["Number"]
        qrel_doc_ids = [
            qrel["docno"] for qrel in feedback_qrels
            if qrel["qid"] == query_number
        ]
        final_list = []
        term_tf_idf = {}
        doc_count = len(qrel_doc_ids)
        for qrel_doc_id in qrel_doc_ids:
            initial_hit = self.feedback_searcher.search(
                TermQuery(Term(".U", qrel_doc_id)), 1).scoreDocs
            if len(initial_hit) == 0:
                continue
            assert len(initial_hit) == 1
            termVector = self.reader.getTermVector(initial_hit[0].doc, "text")
            terms_enum = termVector.iterator()
            termsref = BytesRefIterator.cast_(terms_enum)
            N_terms = 0
            term_idf = {}
            term_freq = {}
            term_list = []
            while (termsref.next()):
                termval = TermsEnum.cast_(termsref)
                termText = termval.term().utf8ToString()
                if termText in self.stopwords:
                    continue
                tc = termval.totalTermFreq()
                if termText in term_freq:
                    term_freq[termText] += tc
                else:
                    term_freq[termText] = tc
                if termText in term_idf:
                    term_idf[termText] += 1
                else:
                    term_idf[termText] = 1
                if termText not in term_list:
                    term_list.append(termText)
                N_terms = N_terms + 1

            for term in term_list:
                if term in term_tf_idf:
                    term_tf_idf[term] += term_freq[term] / N_terms * (
                        1 + math.log(doc_count / (term_idf[term] + 1)))
                else:
                    term_tf_idf[term] = term_freq[term] / N_terms * (
                        1 + math.log(doc_count / (term_idf[term] + 1)))

        sorted_tf_idf = sorted(term_tf_idf.items(),
                               key=lambda x: x[1],
                               reverse=True)
        for each in sorted_tf_idf:
            if each[0] not in self.stopwords and not str(each[0]).isnumeric(
            ) and each[0] not in query_text.split(" "):
                final_list.append(each[0])
        print(final_list[:add_num])
        query_text = query_text + " " + " ".join(final_list[:add_num])
        query_text = " ".join(query_text.split(" "))
        print(query_text)
        query_search = self.parser.parse(query_text)
        results = self.searcher.search(query_search, num_returns)
        hits = results.scoreDocs
        trec_results = []
        for rank, hit in enumerate(hits):
            doc = self.searcher.doc(hit.doc)
            trec_result = {
                "QueryID": query["Number"],
                "Q0": "Q0",
                "DocID": doc.get(".U"),
                "Rank": str(rank + 1),
                "Score": str(hit.score),
                "RunID": self.similarity
            }
            trec_results.append(trec_result)
        return trec_results
Example #37
def getSilhouette(reader, searcher, analyzer, searchTerm, K):
    #get user's query and initiate some dictionaries
    command = searchTerm
    #for each word, record how many matched documents contain it
    termOccurrence = {}
    #inverse document frequency for each word
    idf = {}
    #term frequency for each word in each matched document
    tf = {}
    #the product of tf and idf for each word in each matched document
    tfidf = {}
    #the number of words for each matched document
    docLength = {}
    #for each kind of word in each document,record how many times it appeared in the document
    termOccurrenceInADoc = {}
    #record every kind of word in matched documents as key, the value of this dictionary doesn't matter
    allWords = {}

    #parse the user query
    query = QueryParser("contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    #the total number of matched documents
    totalDocs = 0

    #for each matched document, calculate the normalized term frequency of each of its terms
    for scoreDoc in scoreDocs:
        totalDocs = totalDocs + 1
        #get url and term vectors of the matched document
        doc = searcher.doc(scoreDoc.doc)
        vectors = reader.getTermVector(scoreDoc.doc, "contents")
        enum = vectors.iterator()
        url = doc.get("url")
        #record the url of the matched document, create a nested dictionary for term occurrence of a word in a document
        termOccurrenceInADoc[url] = {}
        #record the url of matched document, create a nested dictionary for term frequency
        tf[url] = {}
        #for each term in the matched document, record each term's occurrence and calculate the total word count of the document
        for term in BytesRefIterator.cast_(enum):
            term2 = term.utf8ToString()
            #record the term
            allWords[term2] = 1
            #increase the termOccurence by one
            if termOccurrence.has_key(term2):
                termOccurrence[term2] = termOccurrence[term2] + 1
            else:
                termOccurrence[term2] = 1
            dpEnum = enum.postings(None)
            dpEnum.nextDoc()
            #get occurrence of the term
            freq = dpEnum.freq()
            #add the occurrence of the term into the total word count of the document
            if docLength.has_key(url):
                docLength[url] = docLength[url] + freq
            else:
                docLength[url] = freq
            #record the occurrence of this term in the corresponding document
            termOccurrenceInADoc[url][term2] = freq
        #for each term in each document, divide its occurrence by the total length of the document to get the normalized term frequency
        for key in termOccurrenceInADoc[url]:
            tf[url][key] = termOccurrenceInADoc[url][key] / float(
                docLength[url])
    #calculate idf for each term
    for key in termOccurrence:
        idf[key] = math.log(float(totalDocs) / termOccurrence[key], 10)
    #calculate the product of tf and idf for each term in each document
    for key in tf:
        tfidf[key] = {}
        for word in allWords:
            if tf[key].has_key(word):
                tfidf[key][word] = tf[key][word] * idf[word]
    k = K
    #this is used to record the attributes of all centroids
    centroids = {}
    #this is used to record all urls of each cluster
    clusters = {}
    #copy the tfidf dictionary, because I need to delete some keys to make sure I choose different initial centroids
    copy = tfidf.copy()
    #randomly choose k vectors as initial centroid
    for counter in range(1, k + 1):
        if len(copy) == 0:
            break
        key = random.choice(copy.keys())
        centroids[counter] = tfidf[key].copy()
        clusters[counter] = {}
        del copy[key]
    #this dictionary is used to record clusters during last iteration, it will be used to compare to new clusters to judge if clusters changed
    oldClusters = {}
    #I will do at most 100 iterations, if clusters no longer change before 100 iterations, break the loop
    for counter2 in range(1, 101):
        #for each vector, find its closest centroid, put it into the according cluster
        for key in tfidf:
            counter = 1
            #for each centroid, calculate the euclidean distance between it and the vector, find the centroid with the shortest euclidean distance
            for centroid in centroids:
                #for the first centroid, simply treat it as closest for now
                if counter == 1:
                    closestDistance = 0
                    for attribute in tfidf[key]:
                        closestDistance = closestDistance + (
                            tfidf[key].get(attribute, 0) -
                            centroids[centroid].get(attribute, 0))**2
                    closestDistance = math.sqrt(closestDistance)
                    closestCentroid = centroid
                #for other centroids, check if their euclidean distance is shorter than temporary closest, if yes, replace closest centroid
                else:
                    temp = 0
                    for attribute in tfidf[key]:
                        temp = temp + (
                            tfidf[key].get(attribute, 0) -
                            centroids[centroid].get(attribute, 0))**2
                    temp = math.sqrt(temp)
                    if temp < closestDistance:
                        closestDistance = temp
                        closestCentroid = centroid
                counter = counter + 1
            #put the url of document into new cluster, the value of the dictionary doesn't matter
            clusters[closestCentroid][key] = 1
        #after finishing calculating new clusters, clean old centroids
        centroids = {}
        #for each cluster, calculate the new centroid
        for cluster in clusters:
            docNumber = 0
            #initialize nested dictionary for each cluster's centroid
            centroids[cluster] = {}
            #for every vector in a cluster, add them together
            for document in clusters[cluster]:
                docNumber = docNumber + 1
                for term in allWords:
                    centroids[cluster][term] = centroids[cluster].get(
                        term, 0) + tfidf[document].get(term, 0)
            #if the cluster is not empty, divide the sum of all vectors by the number of vectors, the result is the new centroid
            if docNumber == 0:
                newCentroid = random.choice(tfidf.keys())
                centroids[cluster] = tfidf[newCentroid].copy()
            else:
                for term in allWords:
                    centroids[cluster][term] = float(
                        centroids[cluster][term]) / docNumber
        #if this is not the first iteration, compare new clusters with old clusters, if they are completely same, break the loop
        if counter2 != 1:
            completelySame = True
            for cluster in clusters:
                for document in clusters[cluster]:
                    if oldClusters[cluster].has_key(document) == False:
                        completelySame = False
                        break
            if completelySame == True:
                break
        #copy every new cluster, compare them with newer clusters of next iteration
        for cluster in clusters:
            oldClusters[cluster] = clusters[cluster].copy()
            clusters[cluster].clear()
    #calculate average silhouette coefficient of all vectors
    silhouette = 0
    numberOfDocuments = 0
    #if no document found or only one cluster is there, treat silhouette as 0
    if len(clusters) <= 1:
        return 0
    averageSI = 0
    counter3 = 0
    #for each vector, calculate the average euclidean distance between it and other vectors in the same cluster (I call it ai)
    #and calculate the lowest average euclidean distance between it and other vectors in another cluster (I call it lowestBi)
    #then use ai and lowestBi to calculate silhouette coefficient for each vector
    for key in clusters:
        if len(clusters[key]) != 0:
            for key2 in clusters[key]:
                si = 0
                #if there is only one vector in this cluster, treat this vector's silhouette coefficient as 0
                if len(clusters[key]) == 1:
                    si = 0
                else:
                    ai = 0
                    bi = {}
                    biCounter = {}
                    counter1 = 0
                    #for other vectors, if they are in the current vector's cluster, add euclidean distance into ai
                    #if they are in other cluster, find the cluster, add euclidean distance(value) and cluster(key) into bi
                    for otherVector in tfidf:
                        if otherVector != key2:
                            if clusters[key].has_key(otherVector):
                                euclidean = 0
                                for attribute in tfidf[otherVector]:
                                    euclidean = euclidean + (
                                        tfidf[otherVector].get(attribute, 0) -
                                        tfidf[key2].get(attribute, 0))**2
                                euclidean = math.sqrt(euclidean)
                                ai = ai + euclidean
                                counter1 = counter1 + 1
                            else:
                                thisCluster = 0
                                #find which cluster this vector belongs to
                                for key3 in clusters:
                                    if clusters[key3].has_key(otherVector):
                                        thisCluster = key3
                                        break
                                euclidean = 0
                                for attribute in tfidf[otherVector]:
                                    euclidean = euclidean + (
                                        tfidf[otherVector].get(attribute, 0) -
                                        tfidf[key2].get(attribute, 0))**2
                                euclidean = math.sqrt(euclidean)
                                bi[thisCluster] = bi.get(thisCluster,
                                                         0) + euclidean
                                biCounter[thisCluster] = biCounter.get(
                                    thisCluster, 0) + 1
                    ai = ai / float(counter1)
                    lowestBi = 0
                    counter4 = 1
                    #find the lowest average euclidean distance between this vector and other clusters
                    for key3 in bi:
                        bi[key3] = bi[key3] / float(biCounter[key3])
                        if counter4 == 1:
                            lowestBi = bi[key3]
                        else:
                            if bi[key3] < lowestBi:
                                lowestBi = bi[key3]
                        counter4 = counter4 + 1
                    #now we have ai and lowest bi, calculate the silhouette coefficient of this vector
                    if ai == 0 or lowestBi == 0:
                        si = 0
                    else:
                        if ai < lowestBi:
                            si = 1 - ai / lowestBi
                        elif ai == lowestBi:
                            si = 0
                        else:
                            si = lowestBi / ai - 1
                averageSI = averageSI + si
                counter3 = counter3 + 1
    #calculate the average silhouette coefficient of all vectors
    averageSI = averageSI / float(counter3)
    return averageSI
Example #38
 def get_terms(self, docid, field):
     terms = self.reader.getTermVector(docid, field)
     te_itr = terms.iterator()
     return [brf.utf8ToString() for brf in BytesRefIterator.cast_(te_itr)]
Example #39
    def search_query(self,
                     query,
                     num_returns=50,
                     use_multipass_pseudo_relevance_feedback=False,
                     doc_counts=None,
                     add_nums=None):

        query_text = query["description"]
        print(query_text.lower())
        query_text = " ".join(tokenizer.tokenize(query_text))
        query_text = self.remove_stopwords(query_text.lower())
        print(query_text)
        query_search = self.parser.parse(query_text)
        if use_multipass_pseudo_relevance_feedback:
            if doc_counts is None:
                doc_counts = [5, 9]
            if add_nums is None:
                add_nums = [2, 13]
            assert len(doc_counts) == len(
                add_nums), "The number of pass is inconsistent!"
            for doc_count, add_num in zip(doc_counts, add_nums):
                final_list = []
                initial_hits = self.searcher.search(query_search,
                                                    doc_count).scoreDocs
                term_tf_idf = {}
                for initial_hit in initial_hits:
                    termVector = self.reader.getTermVector(
                        initial_hit.doc, "text")
                    terms_enum = termVector.iterator()
                    termsref = BytesRefIterator.cast_(terms_enum)
                    N_terms = 0
                    term_idf = {}
                    term_freq = {}
                    term_list = []
                    while (termsref.next()):
                        termval = TermsEnum.cast_(termsref)
                        termText = termval.term().utf8ToString()
                        if termText in self.stopwords:
                            continue
                        tc = termval.totalTermFreq()
                        if termText in term_freq:
                            term_freq[termText] += tc
                        else:
                            term_freq[termText] = tc
                        if termText in term_idf:
                            term_idf[termText] += 1
                        else:
                            term_idf[termText] = 1
                        if termText not in term_list:
                            term_list.append(termText)
                        N_terms = N_terms + 1

                    for term in term_list:
                        if term in term_tf_idf:
                            term_tf_idf[term] += term_freq[term] / N_terms * (
                                1 + math.log(doc_count / (term_idf[term] + 1)))
                        else:
                            term_tf_idf[term] = term_freq[term] / N_terms * (
                                1 + math.log(doc_count / (term_idf[term] + 1)))
                sorted_term_tf_idf = sorted(term_tf_idf.items(),
                                            key=lambda x: x[1],
                                            reverse=True)
                for each in sorted_term_tf_idf:
                    if each[0] not in self.stopwords:
                        final_list.append(each[0])
                print("added query tokens:", final_list[:add_num])
                query_text = query_text + " " + " ".join(final_list[:add_num])
                query_search = self.parser.parse(query_text)
        results = self.searcher.search(query_search, num_returns)
        hits = results.scoreDocs
        trec_results = []
        for rank, hit in enumerate(hits):
            doc = self.searcher.doc(hit.doc)
            trec_result = {
                "QueryID": query["Number"],
                "Q0": "Q0",
                "DocID": doc.get(".U"),
                "Rank": str(rank + 1),
                "Score": str(hit.score),
                "RunID": (self.similarity + "-mpprf-" + str(len(doc_counts)) + "passes"
                          if use_multipass_pseudo_relevance_feedback else self.similarity)
            }
            trec_results.append(trec_result)
        return trec_results