Example #1
 def set_tokens(self):
     input = FileInOut()
     dictionary = input.readDic()
     M = len(dictionary)
     print("M :" + str(M))
     T = 755440
     return T, M
 def __init__(self):
     self.g = Group()
     self.input = FileInOut()
     self.classes = self.input.readClasses("KNN")
     # self.classes = self.input.readClasses("NB")
     self.docVectorList, self.vectorsIds = self.input.readDocsVector()
     self.wordFormer = FormWords()
 def __init__(self):
     self.input = FileInOut()
     self.wordFormer = FormWords()
     self.constants = ConstantVars()
     self.dictionary = dict()
     self.posting_list = np.array([dict() for j in range(150000)])
     self.dicIndex = 0
     self.docIndex = 0
     self.c = 0
class SimilarNews():
    def __init__(self):
        self.inOut = FileInOut()
        self.clusters = self.inOut.readClusters()
        self.g = Group()
        self.similarity = Similiarity()
        self.v, self.d = self.inOut.readDocsVector()

    def minusDocs(self, doc1, doc2):
        if not doc1:
            return []
        return list(set(doc1) - set(doc2))

    def findSimilarNews(self, query, no_news):
        queryProcess = QueryProcByCluster()
        docs, positions, nearestCentroids = queryProcess.processQueryByCluster(query, no_news)
        if docs:
            docIds, positionsIds = self.g.out_group_of_file(docs, positions)
            c = 0
            relatedDocs = []
            for doc in docIds:
                t2 = Time(doc)
                c += 1
                if doc in self.clusters[nearestCentroids[0]]:
                    cluster_docs = self.clusters[nearestCentroids[0]]
                else:
                    cluster_docs = self.clusters[nearestCentroids[1]]
                similarities = []
                candidates = []
                for d in self.minusDocs(cluster_docs, docs[0]):
                    t1 = Time(d + 1)
                    if t1.year == t2.year and t1.month == t2.month and abs(t1.day - t2.day) < 3 and d not in relatedDocs:
                        index = self.d.index(d)
                        index_doc = self.d.index(doc)
                        similarities.append(self.similarity.compute_similarity(self.v[index], self.v[index_doc]))
                        candidates.append(d)
                if similarities:
                    maximum = np.max(similarities)
                    # store the related document id itself, not its index in the similarity list
                    relatedDocs.append(candidates[similarities.index(maximum)])
                if len(relatedDocs) == 5:
                    break
            return docs, positions, relatedDocs
        else:
            return {0: []}, {0: []}, []
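
# Hedged usage sketch (an assumption, not part of the original source): how the
# SimilarNews class above might be driven. findSimilarNews returns the grouped
# query hits plus up to five related documents drawn from the nearest clusters;
# the query string below is only a placeholder.
if __name__ == "__main__":
    finder = SimilarNews()
    docs, positions, related = finder.findSimilarNews("sample query", 10)
    print("query hits:", docs)
    print("related documents:", related)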
Example #5
 def set_cf_dictionary(self):
     input = FileInOut()
     postings = input.readPostingList()
     cfis = {}
     for i in range(len(postings) - 1):
         cfis[i] = 0
         for j in range(len(postings[i]) - 1):
             cfis[i] += len(postings[i][j])
     self.cfDic = sorted(cfis.items(),
                         key=lambda item: item[1],
                         reverse=True)
     cfis.clear()
Example #6
 def __init__(self):
     self.input = FileInOut()
     self.Dic = self.input.readDic()
     self.DocID_file = self.input.readDocID()
     self.posting_file = self.input.readPostingList()
     self.wordFormer = FormWords()
     self.constants = ConstantVars()
     self.relatedDocs = []
     self.notRelatedDocs = []
     self.relatedDocsPos = []
     self.notRelatedDocsPos = []
     self.notRelatedCounts = 0
Example #7
 def __init__(self):
     self.inOut = FileInOut()
     self.df = dict()
     v, d = self.inOut.readDocsVector()
     for i in range(1, 38729):
         for j in v:
             if i in j.keys():
                 self.df.setdefault(str(i), []).append(j[i])
             else:
                 self.df.setdefault(str(i), []).append(0)
     self.df = pd.DataFrame(self.df)
     self.df.index=d
     print('phase 1 completed')
 def __init__(self, algorithm):
     self.train_data = Train_data()
     print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
     self.input = FileInOut()
     self.k = 5
     # self.train_data = self.input.N
     self.docVectorList, self.vectorsIds = self.input.readDocsVector()
     print("222222222222222222222")
     self.trainVectorList, self.trainvectorsIds = self.input.readTrainDocsVector()
     print("33333333333333333333333333333")
     self.num_ov_results = 100
     self.gp = Group([7745])
     print("reached the point just before KNN")
     self.classes = self.KNN()
Example #9
class Kmean:
    def __init__(self):
        self.inOut = FileInOut()
        self.df = dict()
        v, d = self.inOut.readDocsVector()
        for i in range(1, 38729):
            for j in v:
                if i in j.keys():
                    self.df.setdefault(str(i), []).append(j[i])
                else:
                    self.df.setdefault(str(i), []).append(0)
        self.df = pd.DataFrame(self.df)
        self.df.index=d
        print('phase 1 completed')
        # print(self.df.head())
        # self. df = pd.DataFrame({
        #     '1': [12, 20, 28, 18, 29, 33, 24, 45, 45, 52, 51, 52, 55, 53, 55, 61, 64, 69, 72],
        #     '2': [39, 36, 30, 52, 54, 46, 55, 59, 63, 70, 66, 63, 58, 23, 14, 8, 19, 7, 24]
        # })

    def similarity(self, centroids, k):
        for i in range(k):
            d = (self.df.sub(centroids.iloc[i, :]) ** 2).sum(axis=1)
            # d = d ** 2
            # powSum = d.sum(axis=1)
            self.df['distance_from_{}'.format(i)] = (
                np.sqrt(d)
            )
        centroid_distance_cols = ['distance_from_{}'.format(i) for i in range(k)]
        self.df['closest'] = self.df.loc[:, centroid_distance_cols].idxmin(axis=1)
        self.df['closest'] = self.df['closest'].map(lambda x: int(x.lstrip('distance_from_')))
        return self.df

    def updateCentroids(self, centroids, k):
        for i in range(k):
            # if len(self.df.loc[self.df['closest'] == i]) > 0:
            centroids.iloc[i] = self.df.loc[self.df['closest'] == i, [str(l) for l in range(1,38729)]].mean()#no.features 38729
        return centroids

    def cluster(self, k):
        # centroids = {
        #     i + 1: self.df.loc[np.random.randint(0,18)] for i in range(k)
        # }
        # centroids = self.df.ix[np.random.sample(self.df.index, k)]
        centroids = self.df.sample(n = k)
        centroids.index = range(k)
        # np.random.seed(200)
        # centroids = pd.DataFrame({
        #     str(i): [np.random.randint(0, 80) for i in range(k)]
        #     for i in range(1,3) #no.feature38729
        # })
        self.similarity(centroids, k)
        print('sim1')
        a = 0
        while True:
            a += 1
            closest_centroids = self.df['closest'].copy(deep=True)
            centroids= self.updateCentroids(centroids,k)
            print('update')
            self.similarity(centroids, k)
            if closest_centroids.equals(self.df['closest']) or a == 10:
                break
        dist = self.RSSmeasure(centroids, self.df)

        finalcenters = {str(i):{j+1: list(centroids.loc[i, :])[j] for j in range(0, 38728)} for i in range(k)}#no feature-1
        # print(finalcenters)
        self.inOut.writeCentroids(finalcenters, k)
        finalCluster = {str(i): list(self.df.index[self.df['closest'] == i]) for i in range(k)}
        # print(finalCluster)
        self.inOut.writeClusters(finalCluster, k)
        # colmap = {1: 'r', 2: 'g', 3: 'b', 4: 'y', 5:'black', 0:'brown'}
        # fig = plt.figure(figsize=(5, 5))
        # l = []
        # print(self.df)
        # for d in self.df['closest']:
        #     l.append(colmap[d])
        # plt.scatter(self.df['1'], self.df['2'], color=l, alpha=0.5, edgecolor='k')
        # for i in range(k):
        #     plt.scatter(*centroids[i], color=colmap[i])
        # plt.xlim(0, 80)
        # plt.ylim(0, 80)
        # plt.show()
        print(dist)
        return dist

    def RSSmeasure(self, centroids, df):
        dist = 0
        for i in range(len(self.df.index)):#no. doc
            dist += self.df.iloc[i]['distance_from_'+str(int(self.df.iloc[i]['closest']))] ** 2
        return dist
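
# Hedged usage sketch (an assumption, not part of the original source): running the
# K-means routine above for a few values of k and keeping the k with the lowest
# residual sum of squares (RSS) returned by cluster(). A fresh Kmean instance is
# used per run so the distance columns added to df do not carry over between runs.
def _kmeans_demo():
    rss_per_k = {}
    for k in (2, 4, 8):
        rss_per_k[k] = Kmean().cluster(k)
    best_k = min(rss_per_k, key=rss_per_k.get)
    print("RSS per k:", rss_per_k, "-> chosen k:", best_k)
    return best_k
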
class Classifier:
    def __init__(self, algorithm):
        self.train_data = Train_data()
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        self.input = FileInOut()
        self.k = 5
        # self.train_data = self.input.N
        self.docVectorList, self.vectorsIds = self.input.readDocsVector()
        print("222222222222222222222")
        self.trainVectorList, self.trainvectorsIds = self.input.readTrainDocsVector()
        print("33333333333333333333333333333")
        self.num_ov_results = 100
        self.gp = Group([7745])
        print("reached the point just before KNN")
        self.classes = self.KNN()
        # self.classes = self.NB()
        # self.classes = self.input.readClasses(algorithm)

    def KNN(self):
        classes = {
            "science": [],
            "cultureart": [],
            "politics": [],
            "economy": [],
            "social": [],
            "international": [],
            "sport": [],
            "multimedia": []
        }

        for key in classes.keys():
            classes[key].append([])
        print("for size : " + str(len(self.docVectorList)))
        for j in range(len(self.docVectorList)):
            # for j in range(2):
            print("j: " + str(j))
            kbest = []
            for t in range(len(self.trainVectorList)):
                similarity = self.compute_similarity(self.docVectorList[j],
                                                     self.trainVectorList[t])
                if len(kbest) < self.k:
                    kbest.append([
                        similarity,
                        self.train_data.get_cat(self.trainvectorsIds[t])
                    ])
                else:
                    minimum = min(kbest, key=lambda x: x[0])
                    if similarity > minimum[0]:
                        kbest[kbest.index(minimum)] = [
                            similarity,
                            self.train_data.get_cat(self.trainvectorsIds[t])
                        ]
            cat = [Counter(col).most_common(1)[0][0] for col in zip(*kbest)][1]
            fnum, id = self.gp.pack_id(self.vectorsIds[j])
            classes[cat][fnum].append(id)
        self.input.writeClasses(classes, "KNN")
        return classes

    def NB(self):
        classes = {
            "science": [],
            "cultureart": [],
            "politics": [],
            "economy": [],
            "social": [],
            "international": [],
            "sport": [],
            "multimedia": []
        }
        print("for size : " + str(len(self.docVectorList)))
        class_tf, nci, tf_tid = self.get_classes_tf()
        for key in classes.keys():
            classes[key].append([])
        for j in range(len(self.docVectorList)):
            # print("j: " + str(j))
            cat = self.determine_category(self.docVectorList[j], class_tf, nci,
                                          tf_tid)
            fnum, id = self.gp.pack_id(self.vectorsIds[j])
            classes[cat][fnum].append(id)
        self.input.writeClasses(classes, "NB")
        return classes

    def get_classes_tf(self):
        class_tf = {
            "science": 0,
            "cultureart": 0,
            "politics": 0,
            "economy": 0,
            "social": 0,
            "international": 0,
            "sport": 0,
            "multimedia": 0
        }
        nci = {
            "science": 0,
            "cultureart": 0,
            "politics": 0,
            "economy": 0,
            "social": 0,
            "international": 0,
            "sport": 0,
            "multimedia": 0
        }
        tf_tid = {
            "science": {},
            "cultureart": {},
            "politics": {},
            "economy": {},
            "social": {},
            "international": {},
            "sport": {},
            "multimedia": {}
        }
        for t in range(len(self.trainVectorList)):
            td_cat = self.train_data.get_cat(self.trainvectorsIds[t])
            for tid in self.trainVectorList[t].keys():
                if tf_tid[td_cat].get(tid, None) is None:
                    tf_tid[td_cat][tid] = self.trainVectorList[t][tid]
                else:
                    tf_tid[td_cat][tid] += self.trainVectorList[t][tid]
            nci[td_cat] += 1
            # alpha = 1
            class_tf[td_cat] += sum(self.trainVectorList[t].values()
                                    ) + 1 * len(self.trainVectorList[t])
        print("per-class term frequencies (tf_tid):")
        print(tf_tid)
        return class_tf, nci, tf_tid

    def determine_category(self, docVector, class_tf, nci, tf_tid):
        c_score = {
            "science": 0,
            "cultureart": 0,
            "politics": 0,
            "economy": 0,
            "social": 0,
            "international": 0,
            "sport": 0,
            "multimedia": 0
        }
        for cat in c_score.keys():
            c_score[cat] += math.log10(nci[cat] / 1000)
            for tid in docVector.keys():
                if tf_tid[cat].get(tid, None) is None:
                    c_score[cat] += math.log10((1) / class_tf[cat])
                else:
                    c_score[cat] += math.log10(
                        (tf_tid[cat][tid] + 1) / class_tf[cat])
        print("score for each class:")
        print(c_score)
        determined_cat = max(c_score.items(), key=operator.itemgetter(1))[0]
        return determined_cat

    def process_cat(self, query):
        cat_inq = find(query, "cat")
        category = cat_inq[0].split(":")[1]
        query = query.replace(cat_inq[0], '')
        q1 = QueryProc()
        notEliminated = query.replace("!", "")
        docList, indexList = q1.processQueryBySimilarity(notEliminated)
        doc_dic, index_dic = self.gp.grouping_by_file(docList, indexList)
        for key in doc_dic.keys():
            for docId in doc_dic[key]:
                if not docId in self.classes[category][key]:
                    to_remove = doc_dic[key].index(docId)
                    doc_dic[key].pop(to_remove)
                    index_dic[key].pop(to_remove)
        max_heap = self.make_heap(doc_dic, index_dic, query)
        return self.getKbest(max_heap, self.num_ov_results)

    def make_heap(self, doc_dic, index_dic, query):
        maxHeap = MaxHeap()
        queryVector = self.compute_query_wieght(
            Similiarity.get_query_termList(query))
        for key in doc_dic.keys():
            for i in range(len(doc_dic[key])):
                # if docList[i]==7744:
                #     continue
                tot_did = self.gp.unpacking_index(doc_dic[key][i], key)
                k = self.vectorsIds.index(tot_did)
                similarity = self.compute_similarity(queryVector,
                                                     self.docVectorList[k])
                if not Similiarity.is_similsrity_zero(similarity):
                    maxHeap.insert(
                        DocNode(tot_did, index_dic[key][i], similarity))
        return maxHeap

    def getKbest(self, maxHeap, k):
        docList = []
        indexList = []
        for i in range(k):
            docNode = maxHeap.extractMax()
            if docNode is None:
                break
            docList.append(docNode.docId)
            indexList.append(docNode.indexList)
        doc_ids, indexes = self.gp.grouping_by_file(docList, indexList)
        return doc_ids, indexes

    def compute_similarity(self, query_vector, doc_vector):
        dot_product = 0
        for term_id in query_vector.keys():
            dot_product += query_vector.get(term_id) * doc_vector.get(term_id, 0)
        similarity = dot_product / (self.get_size(query_vector) *
                                    self.get_size(doc_vector))
        return similarity

    def get_size(self, vector):
        tfs = vector.values()
        total = 0
        for tf in tfs:
            total += pow(tf, 2)
        return math.sqrt(total)

    def compute_query_wieght(self, termsList):
        dictionary = self.input.readDic()
        docIDs = self.input.readDocID()
        vector = {}
        negative = []
        indices = [i for i, x in enumerate(termsList) if x == '!']
        indices.sort(reverse=True)
        for i in indices:
            negative.append(termsList.pop(i + 1))
            termsList.pop(i)
        for x in termsList:
            term_id = dictionary.index(x) + 1 if x in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = termsList.count(x)
                vector[term_id] = (1 + math.log10(tf)) * math.log10(
                    self.input.N / len(docIDs[term_id]))
                # vector[term_id] = weighting_scheme2_query(len(docIDs[term_id]), self.N)
                # vector[term_id] = weighting_scheme3_query(tf, len(docIDs[term_id]), self.N)
        for y in negative:
            term_id = dictionary.index(y) if y in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = negative.count(y)
                value = (1 + math.log10(tf)) * math.log10(
                    self.input.N / len(docIDs[term_id]))
                # value = weighting_scheme2_query(len(docIDs[term_id]), self.N)
                # value = weighting_scheme3_query(tf, len(docIDs[term_id]), self.N)
                vector[term_id] = -1 * value
        return vector
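
# Hedged usage sketch (an assumption, not part of the original source): constructing
# the classifier, which labels every document vector with KNN inside __init__, and
# then answering a category-restricted query of the form "cat:<class> <terms>".
# The query text below is only a placeholder.
if __name__ == "__main__":
    clf = Classifier("KNN")
    doc_ids, positions = clf.process_cat("cat:sport football")
    print(doc_ids)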
Example #11
class Index:
    def __init__(self):
        self.input = FileInOut()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        self.dictionary = dict()
        self.posting_list = np.array([dict() for j in range(150000)])
        self.dicIndex = 0
        self.docIndex = 0
        self.c = 0

    def Filter(self, string, substr):
        # replace every token that appears in substr with the placeholder '**'
        return [
            token if not any(sub == token for sub in substr) else '**'
            for token in string
        ]

    def makeDic(self, value, j):
        if value not in self.dictionary.keys() and value != '**':
            # print(self.dicIndex)
            # print(value)
            if '\n' not in value:
                self.dictionary[value] = 1
                self.input.writeDic([value])
                self.posting_list[self.dicIndex][self.docIndex] = [j]
                self.dicIndex += 1
        elif value in self.dictionary.keys() and value != '**':
            if self.docIndex in self.posting_list[list(
                    self.dictionary.keys()).index(value)].keys():
                self.posting_list[list(self.dictionary.keys()).index(value)][
                    self.docIndex].append(j)
            else:
                self.posting_list[list(
                    self.dictionary.keys()).index(value)][self.docIndex] = [j]

    def indexData(self):
        for n in range(15):
            data = self.input.readData('ir-news-' + str(n) + '.csv')
            for d in data["content"]:
                print(self.docIndex)
                self.docIndex += 1
                d = self.cleanContent(d)
                d = self.wordFormer.normalize(d)
                tokens = self.wordFormer.tokenize(d)
                self.c += len(tokens)
                tokens = list(filter(lambda a: a != '\n', tokens))
                tokens = self.wordFormer.uniform(tokens)
                # postaged_tokens = self.wordFormer.posTagging(tokens)
                stemmed_tokens = self.wordFormer.stemmWords(tokens)
                lemmatized_tokens = self.wordFormer.lemmatizeWords(
                    stemmed_tokens)
                lemmatized_tokens = self.Filter(
                    lemmatized_tokens,
                    self.constants.punctuations() +
                    ['\"', '\"', '!', '', '\n'] + self.constants.StopWords())
                list(
                    map(self.makeDic, lemmatized_tokens,
                        [i for i in range(0, len(lemmatized_tokens))]))
            print('doc' + str(n) + ': ' + str(self.docIndex))
        # for i in range(len(list(self.dictionary.keys()))):
        #     print(i)
        #     print(list(self.dictionary.keys()).pop(i))
        for i in range(0, len(self.posting_list)):
            self.input.writeDocID(self.posting_list[i])
            self.input.writePostingList([
                self.stringmaker(self.posting_list[i][key])
                for key in self.posting_list[i].keys()
            ])
        print('number of tokens')
        print(self.c)
        print(time.time())

    def getRelatedDocs(self, token):
        if token in self.dictionary:
            # posting_list is indexed by the token's position in the dictionary keys
            return self.posting_list[list(self.dictionary.keys()).index(token)]
        else:
            return {}

    def cleanContent(self, raw):
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        cleanText = re.sub(cleaner, ' ', raw)
        return cleanText

    def stringmaker(self, items):
        stri = ''
        for i in items:
            stri = stri + str(i) + ' '
        return stri
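
# Hedged usage sketch (an assumption, not part of the original source): building the
# positional index from the ir-news-*.csv files that indexData() reads, then looking
# up the posting entry of a single token. Assumes FileInOut, FormWords and
# ConstantVars are available with the interfaces used above.
def _index_demo(token):
    idx = Index()
    idx.indexData()  # writes the dictionary, document ids and positional posting lists
    # returns {doc_index: [positions, ...]} for the token, or {} if it was never indexed
    return idx.getRelatedDocs(token)
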
class QueryProcByCluster:
    def __init__(self):
        self.g = Group()
        self.input = FileInOut()
        self.classes = self.input.readClasses("KNN")
        # self.classes = self.input.readClasses("NB")
        self.docVectorList, self.vectorsIds = self.input.readDocsVector()
        self.wordFormer = FormWords()

    def find(self, s, pat):
        pat = r'(\w*%s\w*)' % pat  # Not thrilled about this line
        return re.findall(pat, s)

    def process_cat(self, query, num_ov_results):
        cat_inq = self.find(query, "cat")
        category = cat_inq[0].split(":")[1]
        query = query.replace(cat_inq[0], '')
        q1 = QueryProc()
        notEliminated = query.replace("!", "")
        docList, indexList = q1.processQueryBySimilarity(notEliminated)
        doc_dic, index_dic = self.g.grouping_by_file(docList, indexList)
        for key in doc_dic.keys():
            for docId in doc_dic[key]:
                if not docId in self.classes[category][key]:
                    to_remove = doc_dic[key].index(docId)
                    doc_dic[key].pop(to_remove)
                    index_dic[key].pop(to_remove)
        max_heap = self.make_heap(doc_dic, index_dic, query)
        return self.getKbest(max_heap, num_ov_results)

    def make_heap(self, doc_dic, index_dic, query):
        maxHeap = MaxHeap()
        simi = Similiarity()
        queryVector = self.compute_query_wieght(simi.get_query_termList(query))
        for key in doc_dic.keys():
            for i in range(len(doc_dic[key])):
                tot_did = self.g.unpacking_index(doc_dic[key][i], key)
                # if tot_did > 7743:
                #     continue
                k = self.vectorsIds.index(tot_did)
                similarity = self.compute_similarity(queryVector, self.docVectorList[k])
                if not simi.is_similsrity_zero(similarity):
                    maxHeap.insert(DocNode(tot_did, index_dic[key][i], similarity))
        return maxHeap

    def getKbest(self, maxHeap, k):
        docList = []
        indexList = []
        for i in range(k):
            docNode = maxHeap.extractMax()
            if docNode is None:
                break
            docList.append(docNode.docId)
            indexList.append(docNode.indexList)
        doc_ids, indexes = self.g.grouping_by_file(docList, indexList)
        return doc_ids, indexes

    def compute_similarity(self, query_vector, doc_vector):
        dot_product = 0
        for term_id in query_vector.keys():
            dot_product += query_vector.get(term_id) * doc_vector.get(term_id, 0)
        similarity = dot_product / (self.get_size(query_vector) * self.get_size(doc_vector))
        return similarity

    def get_size(self, vector):
        tfs = vector.values()
        total = 0
        for tf in tfs:
            total += pow(tf, 2)
        return math.sqrt(total)

    def compute_query_wieght(self, termsList):
        dictionary = self.input.readDic()
        docIDs = self.input.readDocID()
        vector = {}
        negative = []
        indices = [i for i, x in enumerate(termsList) if x == '!']
        indices.sort(reverse=True)
        for i in indices:
            negative.append(termsList.pop(i + 1))
            termsList.pop(i)
        for x in termsList:
            term_id = dictionary.index(x) + 1 if x in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = termsList.count(x)
                vector[term_id] = (1 + math.log10(tf)) * math.log10(self.input.N / len(docIDs[term_id]))
                # vector[term_id] = weighting_scheme2_query(len(docIDs[term_id]), self.N)
                # vector[term_id] = weighting_scheme3_query(tf, len(docIDs[term_id]), self.N)
        for y in negative:
            term_id = dictionary.index(y) if y in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = negative.count(y)
                value = (1 + math.log10(tf)) * math.log10(self.input.N / len(docIDs[term_id]))
                # value = weighting_scheme2_query(len(docIDs[term_id]), self.N)
                # value = weighting_scheme3_query(tf, len(docIDs[term_id]), self.N)
                vector[term_id] = -1 * value
        return vector

    def nearestCentroids(self, query):
        centers, labels = self.input.readCenters()
        distances = []
        for center in centers:
            distances.append(self.compute_similarity(query, center))
        maximum = np.max(distances)
        nearestNeabeor = distances.index(maximum)
        # mask the best centroid instead of removing it, so the original indices stay valid
        distances2 = list(distances)
        distances2[nearestNeabeor] = -np.inf
        maximum2 = np.max(distances2)
        nearestNeabeor2 = distances2.index(maximum2)
        return [str(nearestNeabeor), str(nearestNeabeor2)]

    def mergeDocs(self, doc1, doc2, index1):
        if not doc1:
            return [], []
        # print('type')
        # print(doc1)
        # print(type(doc1))
        # keep only the cluster documents that also matched the query
        doc2 = [d for d in set(doc2) if d in doc1]
        positions = [index1[doc1.index(d)] for d in doc2]
        return doc2, positions

    def processQueryByCluster(self, query, num_ov_results):
        cat = False
        if "cat" in query:
            cat_inq = self.find(query, "cat:")
            category = cat_inq[0].split(":")[1]
            query = query.replace(cat_inq[0], '')
            cat = True
        q1 = QueryProc()
        docList, indexList = q1.processQueryBySimilarity(query)
        term_list = Similiarity.get_query_termList(query)
        queryVec = self.compute_query_wieght(term_list)
        if queryVec == {}:
            return {0:[]} , {0:[]}, []
        nearestCentroids = self.nearestCentroids(queryVec)
        print('center')
        print(nearestCentroids)
        docs = []
        for neabors in nearestCentroids:
            a = list(self.input.readClusters()[neabors])
            docs = docs + a
        print('doclist')
        print(docList)
        mergeDocs, positions  = self.mergeDocs(docList, docs, indexList)
        if not mergeDocs:
            return {0:[]} , {0:[]}, nearestCentroids
        doc_dic, index_dic = self.g.grouping_by_file(mergeDocs, positions)
        print("category filter flag: " + str(cat))
        if cat:
            # if category.lower() in self.classes.keys():
            for key in doc_dic.keys():
                print("before category filtering: " + str(len(doc_dic[key])))
                print(doc_dic[key])
                print("category: " + category.lower())
                to_remove = []
                for t in range(len(doc_dic[key])):  # doc_dic[key][t] is a doc id
                    if doc_dic[key][t] not in self.classes[category.lower()][key]:
                        to_remove.append(t)
                to_remove.sort(reverse=True)
                for f in range(len(to_remove)):
                    index_dic[key].pop(to_remove[f])
                    doc_dic[key].pop(to_remove[f])
                print("after category filtering: " + str(len(doc_dic[key])))
                print(doc_dic)
        max_heap = self.make_heap(doc_dic, index_dic, query)
        kbest1, kbest2 = self.getKbest(max_heap, num_ov_results)
        return kbest1, kbest2, nearestCentroids
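
# Hedged usage sketch (an assumption, not part of the original source): cluster-pruned
# query processing. The query is scored only against documents from the two nearest
# centroids, and an optional "cat:<class>" prefix restricts the hits to one KNN class.
# The query text below is only a placeholder.
if __name__ == "__main__":
    qp = QueryProcByCluster()
    docs, positions, centroids = qp.processQueryByCluster("cat:sport football", 10)
    print("nearest centroids:", centroids)
    print(docs)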
Example #13
 def __init__(self):
     self.inOut = FileInOut()
     self.clusters = self.inOut.readClusters()
     self.g = Group()
     self.similarity = Similiarity()
     self.v, self.d = self.inOut.readDocsVector()
 def __init__(self):
     self.input = FileInOut()
     self.N = self.input.N
class Similiarity:
    def __init__(self):
        self.input = FileInOut()
        self.N = self.input.N
        # self.docVectorList, self.vectorsIds = self.input.readpDocsVector()

    def get_size(self, vector):
        tfs = vector.values()
        total = 0
        for tf in tfs:
            total += pow(tf, 2)
        return math.sqrt(total)

    def compute_similarity(self, query_vector, doc_vector):
        dot_product = 0
        for term_id in query_vector.keys():
            dot_product += query_vector.get(term_id) * doc_vector.get(term_id, 0)
        similarity = dot_product / (self.get_size(query_vector) *
                                    self.get_size(doc_vector))
        return similarity

    def get_index(self, doc_vectors, value):
        for x in doc_vectors:
            if x.docId == value:
                return doc_vectors.index(x)
        return -1
        # doc = next((x for x in doc_vectors if x.docid == value), None)
        # return doc_vectors.index(doc) if doc != None else -1

    @staticmethod
    def get_query_termList(query):
        wordFormer = FormWords()
        constants = ConstantVars()
        query = wordFormer.normalize(query)
        query_tokens = wordFormer.tokenize(query)
        # filter in one pass; removing from the list while iterating over it skips tokens
        query_tokens = [token for token in query_tokens
                        if token not in constants.punctuations()
                        and token not in constants.StopWords()]
        query_tokens = wordFormer.uniform(query_tokens)
        # postaged_tokens = wordFormer.posTagging(query_tokens)
        stemmed_tokens = wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens)

        lemmatized_tokens = list(filter(lambda a: a != '"', lemmatized_tokens))
        return lemmatized_tokens

    def compute_query_wieght(self, termsList):
        dictionary = self.input.readDic()
        docIDs = self.input.readDocID()
        vector = {}
        negative = []
        indices = [i for i, x in enumerate(termsList) if x == '!']
        indices.sort(reverse=True)
        for i in indices:
            negative.append(termsList.pop(i + 1))
            termsList.pop(i)
        for x in termsList:
            term_id = dictionary.index(x) + 1 if x in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = termsList.count(x)
                vector[term_id] = (1 + math.log10(tf)) * math.log10(
                    self.N / len(docIDs[term_id]))
                # vector[term_id] = weighting_scheme2_query(len(docIDs[term_id]), self.N)
                # vector[term_id] = weighting_scheme3_query(tf, len(docIDs[term_id]), self.N)
        for y in negative:
            term_id = dictionary.index(y) if y in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = negative.count(y)
                value = (1 + math.log10(tf)) * math.log10(
                    self.N / len(docIDs[term_id]))
                # value = weighting_scheme2_query(len(docIDs[term_id]), self.N)
                # value = weighting_scheme3_query(tf, len(docIDs[term_id]), self.N)
                vector[term_id] = -1 * value
        return vector

    def compute_docs_wieghts(self):
        docIDs = self.input.readDocID()
        postings = self.input.readPostingList()
        doc_vectors = []
        for i in range(115148):
            print(i)
            for j in range(len(docIDs[i]) - 1):
                index = self.get_index(doc_vectors, docIDs[i][j])
                if index == -1:
                    doc_vectors.append(DocumentVector(docIDs[i][j]))
                    index = self.get_index(doc_vectors, docIDs[i][j])
                doc_vectors[index].fill_vector(i + 1, len(postings[i][j]),
                                               len(docIDs[i]), self.N,
                                               docIDs[i])
        doc_vectors.sort(key=lambda x: int(x.docId))
        self.input.writepDocsVector(doc_vectors)
        return doc_vectors

    def process_query(self, query, k):
        q1 = QueryProc()
        notEliminated = query.replace("!", "")
        docList, indexList = q1.processQueryBySimilarity(notEliminated)
        max_heap = self.make_heap(docList, indexList, query)
        return self.getKbest(max_heap, k)

    def make_heap(self, docList, indexList, query):
        maxHeap = MaxHeap()
        queryVector = self.compute_query_wieght(self.get_query_termList(query))
        for i in range(len(docList)):
            if docList[i] == 7744:
                continue
            k = self.vectorsIds.index(docList[i])
            similarity = self.compute_similarity(queryVector,
                                                 self.docVectorList[k])
            if not self.is_similsrity_zero(similarity):
                maxHeap.insert(DocNode(docList[i], indexList[i], similarity))

        return maxHeap

    @staticmethod
    def is_similsrity_zero(similarity):
        return similarity == 0.0

    def getKbest(self, maxHeap, k):
        simsum = 0
        docList = []
        indexList = []
        for i in range(k):
            docNode = maxHeap.extractMax()
            if docNode is None:
                break
            simsum += docNode.similarity
            docList.append(docNode.docId)
            indexList.append(docNode.indexList)
        return docList, indexList
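
# Hedged illustration (an assumption, not part of the original source): the cosine
# similarity used throughout this code, shown on two toy tf-idf vectors keyed by
# term id. compute_similarity sums the products over the query terms and divides by
# the product of the two Euclidean norms; constructing Similiarity() assumes
# FileInOut() is available.
if __name__ == "__main__":
    sim = Similiarity()
    q = {1: 1.0, 7: 0.5}           # query vector: term_id -> weight
    d = {1: 2.0, 3: 1.0, 7: 1.0}   # document vector: term_id -> weight
    print(sim.compute_similarity(q, d))  # (1*2 + 0.5*1) / (|q| * |d|) ~= 0.913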
Example #16
class QueryProc:
    def __init__(self):
        self.input = FileInOut()
        self.Dic = self.input.readDic()
        self.DocID_file = self.input.readDocID()
        self.posting_file = self.input.readPostingList()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        self.relatedDocs = []
        self.notRelatedDocs = []
        self.relatedDocsPos = []
        self.notRelatedDocsPos = []
        self.notRelatedCounts = 0

    def initializing(self, query):
        print(query)
        query = self.wordFormer.normalize(query)
        print(query)
        query_tokens = self.wordFormer.tokenize(query)
        # filter in one pass; removing from the list while iterating over it skips tokens
        query_tokens = [token for token in query_tokens
                        if token not in self.constants.punctuations()
                        and token not in self.constants.StopWords()]
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)
        i = j = 0
        k = 0
        not_include = False
        order = False
        orderTokens = [[] for i in range(5)]
        for token in lemmatized_tokens:
            print(token)
            if token == "«" and order == False:
                print('first')
                k += 1
                order = True
                continue
            if token == "»" and order == True:
                print('second')
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
            elif not order:
                # a plain query term: collect its postings as related documents
                self.relatedDocs.append(self.getRelatedSavedDocs(token))
                self.relatedDocsPos.append(self.getRelatedSavedpos(token))

            # related_result, relatedPos = self.merge(self.relatedDocs, i)
        related_result = []
        relatedPos = []
        for res in range(len(self.relatedDocs)):
            related_result = related_result + self.relatedDocs[res]
            relatedPos = relatedPos + self.relatedDocsPos[res]
        related_result = list(set(related_result))
        relatedPos = relatedPos[:len(related_result)]
        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)

        relateds_and_not_unrelateds, related_position = self.notMerge(final_result, final_pos)
                # i += 1
        return relateds_and_not_unrelateds,related_position

    def merge_common_docs(self, common_list, docList1, docList2, indexList1, indexList2):
        for doc in common_list:
            i1 = docList1.index(doc)
            i2 = docList2.index(doc)
            docList2.pop(i2)
            indexList1[i1] = indexList1[i1] + indexList2.pop(i2)
        indexList1 = indexList1 + indexList2
        docList1 = docList1 + docList2
        return indexList1, docList1

    def similarity_merge(self, docLists, indexLists):
        if len(docLists) == 0:
            return None, None
        docs = docLists.pop(0)
        indexes = list(filter(lambda n: n != [], indexLists.pop(0)))
        if len(docLists) == 0:
            return docs, indexes
        # consume the remaining lists one by one instead of mutating the list while iterating it
        while docLists:
            doci = docLists.pop(0)
            dociPos = list(filter(lambda n: n != [], indexLists.pop(0)))
            common = list(set(doci) & set(docs))
            indexes, docs = self.merge_common_docs(common, docs, doci, indexes, dociPos)
        return docs, indexes

    def processQueryBySimilarity(self, query):
        print('queryyy')
        print(query)
        docList, indexList = self.initializing(query)
        # related_result, related_pos = self.relatedDocs, self.relatedDocsPos
        # j = 0
        # if related_result != []:
        #     j += 1
        # for i in range(0, k):
        #     phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
        #     related_result.append(phrase_container)
        #     related_pos.append(phrase_pos)
        #     j += 1
        # relateds_and_not_unrelateds, related_position = self.finalMerge(related_result, related_pos, j)
        # # relateds_and_not_unrelateds, related_position = self.similarity_merge(related_result, related_pos)
        # docList, indexList = self.notMerge(relateds_and_not_unrelateds, related_position)
        return docList, indexList

    def processQuery(self, query):
        query = self.wordFormer.normalize(query)
        query_tokens = self.wordFormer.tokenize(query)
        # filter in one pass; removing from the list while iterating over it skips tokens
        query_tokens = [token for token in query_tokens
                        if token not in self.constants.punctuations()
                        and token not in self.constants.StopWords()]
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)
        i = j = 0
        k = 0
        not_include = False
        order = False
        orderTokens = [[] for i in range(5)]
        for token in lemmatized_tokens:
            if token == "\"" and order == False:
                k += 1
                order = True
                continue
            if token == "\"" and order == True:
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
            else:
                self.relatedDocs.append(self.getRelatedSavedDocs(token))
                self.relatedDocsPos.append(self.getRelatedSavedpos(token))
                i += 1
        # print('related docs')
        # print(self.relatedDocs)
        related_result, relatedPos = self.merge(self.relatedDocs, i)
        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        # print("self.notRelatedCounts")
        # print(self.notRelatedCounts)
        # print('no relate')
        # print(self.notRelatedDocs)
        relateds_and_not_unrelateds, related_position = self.notMerge(final_result, final_pos)
        # for i in range(len(related_pos)):
        #     related_pos[i] = related_pos[i]
        # print(relateds_and_not_unrelateds)
        # print(related_position)
        return relateds_and_not_unrelateds, related_position

    def merge(self, docs, leng):
        answer = []
        postingAns = []
        if leng == 0:
            return [], []
        elif leng == 1:
            return docs[0], self.relatedDocsPos[0]
        else:
            p2 = docs[0]
            postings2 = []
            for j in range(len(p2)):
                postings2.append(self.relatedDocsPos[0][j])
            i = 1
            while i < leng:
                p1 = docs[i]
                postings1 = []
                for j in range(len(p1)):
                    postings1.append(self.relatedDocsPos[i][j])
                i += 1
                # start a fresh intersection for this pass (mirrors phraseContainerDocs)
                answer = []
                postingAns = []
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        postingAns.append(postings1[0] + postings2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        postings1.remove(postings1[0])
                        postings2.remove(postings2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        postings1.remove(postings1[0])
                    else:
                        p2.remove(p2[0])
                        postings2.remove(postings2[0])
                p2 = answer
                postings2 = postingAns
        print('docc')
        print(answer)
        print(postingAns)
        return answer, postingAns

    def finalMerge(self, docs, docPos, length):
        answer = []
        docPosAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0]), list(docPos[0])
        else:
            p2 = list(docs[0])
            docPos2 = list(docPos[0])
            i = 1
            while i < length:
                p1 = list(docs[i])
                docPos1 = list(docPos[i])
                i += 1
                # start a fresh intersection for this pass
                answer = []
                docPosAns = []
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        docPosAns.append(docPos1[0] + docPos2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        docPos1.remove(docPos1[0])
                        docPos2.remove(docPos2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        docPos1.remove(docPos1[0])
                    else:
                        p2.remove(p2[0])
                        docPos2.remove(docPos2[0])
                p2 = answer
                docPos2 = docPosAns
        # print('docc and double quote')
        # print(answer)
        # print(docPosAns)
        return answer, docPosAns

    def notMerge(self, relatedDocs, relatedPos):
        print('no relate')
        print(self.notRelatedDocs)
        answer = []
        postingAns = []
        if self.notRelatedCounts == 0:
            if len(relatedDocs) != 0:
                return relatedDocs, list(relatedPos)
            else:
                return [], []
        else:
            p1 = relatedDocs
            posting1 = relatedPos
            i = 0
            while i < self.notRelatedCounts:
                p2 = self.notRelatedDocs[i]
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                        p2.remove(p2[0])
                    elif p1[0] < p2[0]:
                        answer.append(p1[0])
                        postingAns.append(posting1[0])
                        posting1.remove(posting1[0])
                        p1.remove(p1[0])
                    else:
                        p2.remove(p2[0])
        for p in p1:
            answer.append(p)
        for posting in posting1:
            postingAns.append(posting)
        print('finall docc')
        return answer, postingAns

    def phraseContainerDocs(self, phrase):
        # intersect the postings of consecutive phrase tokens, keeping only
        # documents where the tokens occur in adjacent positions
        docs = []
        docsPos = []
        for p in phrase:
            docs.append(self.getRelatedSavedDocs(p))
            docsPos.append(self.getRelatedSavedpos(p))
        answer = []
        answer_posting = [[] for k in range(50)]
        length = len(docs)
        if length == 0:
            return [], []
        elif length == 1:
            # print(docs[0])
            return docs[0], docsPos[0]
        else:
            p2 = docs[0]
            posting2 = docsPos[0]
            i = 1
            while i < len(phrase):
                index = -1
                answer = []
                answer_posting = [[] for k in range(50)]
                p1 = docs[i]
                posting1 = docsPos[i]
                i += 1
                while (p1 != [] and p2 != []):
                    if p1[0] == p2[0]:
                        for posting in posting2[0]:
                            if (posting + 1) in posting1[0]:
                                if p1[0] not in answer:
                                    answer.append(p1[0])
                                    index += 1
                                    answer_posting[index].append(posting + 1)
                        # print({p1[0] : docs[i - 1][p1[0]]})
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        posting1.remove(posting1[0])
                        posting2.remove(posting2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                    else:
                        p2.remove(p2[0])
                        posting2.remove(posting2[0])
                p2 = answer
                # print('ans')
                # print(answer)
                # print(answer_posting)
                posting2 = answer_posting
        # print('double qoute')
        # print(answer)
        # print(answer_posting)
        return answer, answer_posting

    def getRelatedSavedDocs(self, token):
        if token in self.Dic:
            # print(self.Dic.index(token))
            posting = list(map(int, self.DocID_file[self.Dic.index(token)]))
            print(posting)
            return posting
        return []

    def getRelatedSavedpos(self, token):
        if token in self.Dic:
            # print(self.Dic.index(token))
            posting = [list(map(int, self.posting_file[self.Dic.index(token)][j].split(' '))) for j in
                       range(len(self.posting_file[self.Dic.index(token)]))]
            return posting
        return []
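
# Hedged illustration (an assumption, not part of the original source): the sorted-list
# intersection that merge()/finalMerge() above perform on document id lists, shown
# standalone on toy postings. Both inputs are assumed sorted in ascending order.
def intersect_postings(p1, p2):
    answer = []
    i = j = 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            answer.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            i += 1
        else:
            j += 1
    return answer


if __name__ == "__main__":
    print(intersect_postings([1, 3, 5, 8, 13], [2, 3, 8, 21]))  # prints [3, 8]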