    def readFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # walk through the folder
            # skip sub-directories (check the full path, not the bare name)
            if not os.path.isdir(os.path.join(self.path, file)):
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        tree = javalang.parse.parse(
                            self.documents[file]['content'])
                    except javalang.parser.JavaSyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # index the AST, ignoring string literals and
                    # user-defined variable names
                    self.fileIndex[file] = {}
                    names = []  # user-defined names collected while indexing
                    self.lastLineNo = 0
                    self.index(tree, file, names, {}, {}, False)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("got " + str(len(self.documents)) +
                               " documents")
        # use the pickle module to save the index into 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.fileIndex, f, True)
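Since a pickle stream hands objects back in the order they were written, any loader for this index file has to call pickle.load the same number of times and in the same sequence as the dumps above. A minimal loader sketch (load_index is a hypothetical helper, not part of the source; the file name comes from the comment above):

import pickle

# Hypothetical loader for the index written above: pickle.load() returns
# objects in exactly the order pickle.dump() wrote them.
def load_index(index_path='CodexIndexAST.pik'):
    with open(index_path, 'rb') as f:
        weights = pickle.load(f)     # dumped first
        file_index = pickle.load(f)  # dumped second
    return weights, file_index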
Example #2
    def ReadFiles(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        for file in self.files:  # walk through the folder
            # skip sub-directories (check the full path, not the bare name)
            if not os.path.isdir(os.path.join(self.path, file)):
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    try:
                        root = ast.parse(str(self.documents[file]['content']))
                    except SyntaxError:
                        self.lw.write_error_log("syntax error! " + file)
                        continue
                    # normalise the AST: strip string literals and variable names
                    self.visitor.visit(root)
                    self.lineNums[file] = {}
                    self.hashDic[file] = {}
                    self.Indexing(root, self.lineNums[file], self.weights,
                                  file)
                else:
                    self.documents.pop(file)
        self.files = list(self.documents.keys())

        self.lw.write_info_log("got " + str(len(self.documents)) +
                               " documents")
        # use the pickle module to save the index into 'CodexIndexAST.pik'
        with open(self.index_path, 'wb') as f:
            pickle.dump(self.weights, f, True)
            pickle.dump(self.lineNums, f, True)
            pickle.dump(self.hashDic, f, True)
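This method delegates normalisation to a pre-built self.visitor whose job, per the comments, is to strip string literals and variable names so that structurally identical code compares equal. The visitor itself is not shown anywhere in this listing; the sketch below is one plausible shape for it using ast.NodeTransformer (the class name and the placeholder values are assumptions, not the project's actual code):

import ast

# Hypothetical normalising visitor: rewrite every identifier and string
# literal to a fixed placeholder so two snippets that differ only in
# naming produce identical ASTs.
class NormalizingVisitor(ast.NodeTransformer):
    def visit_Name(self, node):
        # replace variable names with a placeholder, keeping load/store context
        return ast.copy_location(ast.Name(id='_var', ctx=node.ctx), node)

    def visit_Constant(self, node):
        # replace string literals only; leave numbers and other constants alone
        if isinstance(node.value, str):
            return ast.copy_location(ast.Constant(value='_str'), node)
        return node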
Example #3
    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        # intended use: compare the imported code against a query,
        # e.g. self.compareQueries(dic['code'], q1)

    # compare whether two queries are the same using hash functions
    def compareQueries(self, query1, query2):
        h1 = self.nodeToHash(query1)
        h2 = self.nodeToHash(query2)
        return h1 == h2

    # parse a query and hash its normalised AST
    def nodeToHash(self, node):
        qRoot = ast.parse(node)
        self.visitor.visit(qRoot)
        qt = ast.dump(qRoot)
        m = hashlib.md5()
        m.update(qt.encode("utf8"))
        h = m.hexdigest()
        return h
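With a normaliser like the sketch above, compareQueries should report two queries as equal whenever they differ only in the details the visitor strips out. A hypothetical usage (engine stands for an instance of this class with such a visitor attached):

# Hypothetical usage: the identifiers differ but the structure matches,
# so the normalised-AST hashes should be equal.
same = engine.compareQueries("x = a + b", "total = p + q")
print(same)  # True under identifier/string normalisation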
Example #4
    def import_in(self, filename):
        dic = conv.to_dic(file_name=filename)
        print(dic['content'])
Example #5
    def indexing(self):
        self.lw.write_info_log("reading files...")
        self.files = os.listdir(self.path)  # get all the file names
        if '.DS_Store' in self.files:
            self.files.remove('.DS_Store')
        fs = len(self.files)
        self.tfidf = TfidfVectorizer()
        i = 0
        while i < fs:  # walk through the folder
            file = self.files[i]
            # skip sub-directories (check the full path, not the bare name)
            if not os.path.isdir(os.path.join(self.path, file)):
                self.documents[file] = conv.to_dic(self.path + "/" + file)
                if len(self.documents[file]['content'].strip()) > 0:
                    self.contents.append(self.documents[file]['content'])
                    # record the line numbers where each term occurs
                    self.lineNo[file] = {}
                    j = 0
                    for line in self.documents[file]['content'].split('\n'):
                        try:
                            # fit on the single line to get its unique terms
                            self.tfidf.fit_transform([line])
                        except ValueError:
                            # raised when the line yields no valid terms
                            j += 1
                            continue
                        for term in self.tfidf.vocabulary_:
                            if term in self.lineNo[file]:
                                self.lineNo[file][term].append(j)
                            else:
                                self.lineNo[file][term] = [j]
                        j += 1
                    i += 1
                else:
                    self.documents.pop(file)
                    self.files.remove(file)
                    fs -= 1
            else:
                self.files.remove(file)
                fs -= 1  # keep the loop bound in sync after removing a directory
        print('finish reading')
        size = len(self.documents)
        self.lw.write_info_log("got " + str(size) + " documents")
        self.lw.write_info_log("indexing...")
        self.re = self.tfidf.fit_transform(self.contents).toarray().T  # term-document tf-idf matrix
        self.idf = self.tfidf.idf_
        self.word = list(self.tfidf.vocabulary_.keys())

        # compress the dense matrix into sparse DOK format
        self.re = dok_matrix(self.re)
        print("start SVD")
        # truncated SVD; k must be smaller than both matrix dimensions
        self.u, self.s, self.d = svds(self.re, k=1000)
        print('start dumping')
        # store the index with pickle
        with open(self.index_path, 'wb') as f:  # saves data into file 'CodexIndex.pik'
            pickle.dump(self.s, f, True)
            pickle.dump(self.u, f, True)
            pickle.dump(self.d, f, True)
            pickle.dump(self.tfidf, f, True)
            pickle.dump(self.lineNo, f, True)
            print('finish')
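The truncated SVD gives the usual LSI factors: u holds one row per term, s the singular values, and d (scipy's V^T) one column per document. Retrieval is not shown in this listing, but a query is typically folded into the k-dimensional latent space through u and the inverse singular values, then ranked against the document columns; a minimal sketch under those assumptions (query_lsi is a hypothetical helper, not part of the source):

import numpy as np

# Hypothetical LSI retrieval step: fold the query's tf-idf vector into the
# latent space (q_hat = q U S^-1) and rank documents by cosine similarity
# against the columns of d (the V^T factor returned by svds).
def query_lsi(query, tfidf, u, s, d):
    q = tfidf.transform([query]).toarray().ravel()  # query as a term vector
    q_hat = (q @ u) / s                             # fold-in projection
    doc_vecs = d.T                                  # one row per document
    sims = doc_vecs @ q_hat / (
        np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(q_hat) + 1e-12)
    return np.argsort(-sims)                        # best matches first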