def __init__(self):
    """Create the helper objects and load the data this instance works on.

    ``self.classes`` holds whatever ``readClasses("KNN")`` returns, and
    ``self.docVectorList`` / ``self.vectorsIds`` hold the two values
    returned by a single ``readDocsVector()`` call.
    """
    self.g = Group()
    reader = FileInOut()
    self.input = reader
    # "KNN" selects which stored result is read; "NB" is the other key
    # that was used here at some point.
    self.classes = reader.readClasses("KNN")
    doc_vectors, vector_ids = reader.readDocsVector()
    self.docVectorList = doc_vectors
    self.vectorsIds = vector_ids
    self.wordFormer = FormWords()
示例#2
0
 def set_tokens(self):
     """Return the pair ``(T, M)``.

     ``M`` is the length of the object returned by
     ``FileInOut().readDic()``; it is printed before returning.
     ``T`` is the fixed value 755440.
     """
     reader = FileInOut()
     M = len(reader.readDic())
     print("M :{}".format(M))
     # NOTE(review): hard-coded constant; presumably a precomputed total
     # token count for the corpus - confirm.
     T = 755440
     return T, M
 def __init__(self):
     """Create the helper objects and the empty index structures.

     ``self.posting_list`` is a NumPy array built from a list of 150000
     separate empty dicts; ``self.dictionary`` starts empty and the three
     counters start at zero.
     """
     self.input = FileInOut()
     self.wordFormer = FormWords()
     self.constants = ConstantVars()
     self.dictionary = {}
     # Each slot gets its own dict object (not one dict shared 150000 times).
     empty_slots = [{} for _ in range(150000)]
     self.posting_list = np.array(empty_slots)
     self.dicIndex = self.docIndex = self.c = 0
示例#4
0
 def set_cf_dictionary(self):
     """Compute a per-entry total from the posting list and store it sorted.

     For every index ``i`` in ``range(len(postings) - 1)`` the lengths of
     ``postings[i][j]`` are summed over ``j`` in
     ``range(len(postings[i]) - 1)``.  The resulting ``(index, total)``
     pairs are stored in ``self.cfDic`` ordered by total, largest first.
     """
     reader = FileInOut()
     postings = reader.readPostingList()
     totals = {}
     # NOTE(review): both ranges stop one short of the full length, so the
     # last posting and the last index of each posting are not counted -
     # confirm this is intentional.
     for idx in range(len(postings) - 1):
         entry = postings[idx]
         totals[idx] = sum(len(entry[pos]) for pos in range(len(entry) - 1))
     self.cfDic = sorted(totals.items(),
                         key=lambda pair: pair[1],
                         reverse=True)
     # The sorted list is an independent copy; empty the working dict.
     totals.clear()
示例#5
0
 def __init__(self):
     """Load the stored structures and reset the relevance bookkeeping.

     The values returned by ``readDic()``, ``readDocID()`` and
     ``readPostingList()`` are kept on the instance, the helper objects
     are created, and the related / not-related lists start empty with
     the not-related counter at zero.
     """
     reader = FileInOut()
     self.input = reader
     self.Dic = reader.readDic()
     self.DocID_file = reader.readDocID()
     self.posting_file = reader.readPostingList()
     self.wordFormer = FormWords()
     self.constants = ConstantVars()
     # Four distinct lists: documents and their positions, for each of the
     # related and not-related groups.
     self.relatedDocs, self.notRelatedDocs = [], []
     self.relatedDocsPos, self.notRelatedDocsPos = [], []
     self.notRelatedCounts = 0
示例#6
0
 def __init__(self):
     """Build ``self.df`` from the values returned by ``readDocsVector()``.

     For every integer key ``i`` in 1..38728 a column named ``str(i)`` is
     created holding, for each vector in turn, the vector's value at
     ``i`` or 0 when the vector has no such key.  The second value
     returned by ``readDocsVector()`` becomes the frame's index.
     """
     self.inOut = FileInOut()
     vectors, row_labels = self.inOut.readDocsVector()
     columns = {}
     for key in range(1, 38729):
         values = [vec[key] if key in vec else 0 for vec in vectors]
         # A column only exists once at least one vector contributed to
         # it, so an empty collection of vectors yields no columns.
         if values:
             columns[str(key)] = values
     frame = pd.DataFrame(columns)
     frame.index = row_labels
     self.df = frame
     print('phase 1 completed')
 def __init__(self, algorithm):
     """Load the document and training vectors, then run ``self.KNN()``.

     Progress markers are printed between the loading steps, and the
     result of ``self.KNN()`` is stored in ``self.classes``.

     Args:
         algorithm: Accepted but not read anywhere in this constructor;
             ``self.KNN()`` is called regardless of its value.
     """
     self.train_data = Train_data()
     print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
     reader = FileInOut()
     self.input = reader
     self.k = 5
     doc_vectors, doc_ids = reader.readDocsVector()
     self.docVectorList = doc_vectors
     self.vectorsIds = doc_ids
     print("222222222222222222222")
     train_vectors, train_ids = reader.readTrainDocsVector()
     self.trainVectorList = train_vectors
     self.trainvectorsIds = train_ids
     print("33333333333333333333333333333")
     self.num_ov_results = 100
     # NOTE(review): the meaning of the 7745 argument is not visible here -
     # confirm against Group.
     self.gp = Group([7745])
     print("ta ghable knn umaaaaaaad")
     self.classes = self.KNN()
 def __init__(self):
     """Load the stored clusters and document vectors and create helpers.

     ``self.clusters`` holds the value returned by ``readClusters()``;
     ``self.v`` and ``self.d`` hold the two values returned by
     ``readDocsVector()``.
     """
     reader = FileInOut()
     self.inOut = reader
     self.clusters = reader.readClusters()
     self.g = Group()
     self.similarity = Similiarity()
     vectors, labels = reader.readDocsVector()
     self.v = vectors
     self.d = labels
 def __init__(self):
     """Create a ``FileInOut`` reader and copy its ``N`` attribute onto this object."""
     self.input = FileInOut()
     # NOTE(review): N is presumably a document count exposed by
     # FileInOut - confirm against that class.
     self.N = self.input.N