Exemplo n.º 1
0
 def getsents(self, type="java"):
     """Pre-process every input file, run extraction on it and cache the result as JSON.

     For each ``(name, text)`` pair yielded by ``self.getfile(type)`` the
     text is split into lines, passed through ``self.getpredeal`` and
     ``self.getexcate``, and the result is dumped to
     ``<root>/data5/<sha>.json`` where ``sha = self.getsha(name)``.  A
     ``[sha, name]`` row is appended to ``<root>/data2/extract_sha_name_csv``
     for every file, including files whose JSON output already exists and is
     therefore not rewritten.

     :param type: forwarded unchanged to ``self.getfile``.
     :return: None
     """
     self.logger.info("getsents")
     import json

     root_dir = GetFilePathRoot.get_root_dir()
     # Directory holding one JSON extraction result per input file.
     out_dir = os.path.join(root_dir, "data5")
     if not os.path.exists(out_dir):
         os.mkdir(out_dir)
     # CSV that maps each hashed file name back to the original name.
     self.sha_name_file = os.path.join(root_dir, "data2",
                                       "extract_sha_name_csv")
     self.logger.info(0)
     for index, (name, text) in enumerate(self.getfile(type), 1):
         self.logger.info((index, name))
         # Normalise Windows / old-Mac line endings before splitting.
         sents = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
         # NOTE(review): the processing below runs even when the JSON
         # output already exists and will be skipped; it is kept here
         # because the helpers may have side effects -- confirm before
         # moving it after the existence check.
         dealsents = self.getpredeal(sents=sents)
         result = self.getexcate(dealsents)
         # Hash once; the CSV row and the output path both use it.
         sha = self.getsha(name)
         # The `with` block closes the file; no explicit close() needed.
         with open(self.sha_name_file, "a") as csv_file:
             csv.writer(csv_file).writerow([sha, name])
         json_path = os.path.join(out_dir, sha + ".json")
         if os.path.exists(json_path):
             # Already extracted on a previous run: keep the cached file.
             continue
         with open(json_path, "w") as dump_f:
             json.dump(result, dump_f)
     self.log(ExtractModeTask.EXTRACT)
     return
Exemplo n.º 2
0
 def loaddiff(self):
     """Create the lazy ``diff1`` corpus loader and store it on ``self.diff1``.

     The corpus root is ``<root>/data``, decoded from GB2312 to a unicode
     string before it is logged and handed to the loader.
     """
     data_dir = os.path.join(GetFilePathRoot.get_root_dir(), "data")
     # Decode explicitly so directory names containing Chinese characters
     # are not garbled.
     corpus_root = unicode(str(data_dir), "GB2312")
     self.logger.info(corpus_root)
     self.logger.info("加载语料库 lazyload")
     # NOTE(review): NLTK documents LazyCorpusLoader's first argument as a
     # corpus name looked up under the NLTK data path -- confirm that
     # passing a directory path here behaves as intended.
     diff1_pattern = r".*/diff1/.*\.txt"
     self.diff1 = LazyCorpusLoader(
         corpus_root, PlaintextCorpusReader, diff1_pattern)
     self.logger.info("加载语料库 完毕")
Exemplo n.º 3
0
 def getsents(self,type="java"):
     """Pre-process every input file and cache the cleaned text on disk.

     For each ``(name, text)`` pair yielded by ``self.getfile(type)`` the
     text is split into lines and passed to ``self.getpredeal``; the value
     it returns is written to ``<root>/data3/<sha>`` where
     ``sha = self.getsha(name)``.  A ``[sha, name]`` row is appended to
     ``<root>/data2/DealCorpus_sha_name_csv`` for every file, including
     files whose output already exists and is therefore not rewritten.

     :param type: forwarded unchanged to ``self.getfile``.
     :return: None
     """
     self.logger.info("getsents")
     root_dir = GetFilePathRoot.get_root_dir()
     # Directory holding one pre-processed text file per input file.
     out_dir = os.path.join(root_dir, "data3")
     if not os.path.exists(out_dir):
         os.mkdir(out_dir)
     # CSV that maps each hashed file name back to the original name.
     self.sha_name_file = os.path.join(root_dir, "data2", "DealCorpus_sha_name_csv")
     self.logger.info(0)
     for index, (name, text) in enumerate(self.getfile(type), 1):
         self.logger.info((index, name))
         # Normalise Windows / old-Mac line endings before splitting.
         sents = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
         # NOTE(review): this runs even when the output already exists and
         # will be skipped; kept here because the helper may have side
         # effects -- confirm before moving it after the existence check.
         result = self.getpredeal(sents=sents)
         # Hash once; the CSV row and the output path both use it.
         sha = self.getsha(name)
         # The `with` block closes the file; no explicit close() needed.
         with open(self.sha_name_file, "a") as csv_file:
             csv.writer(csv_file).writerow([sha, name])
         out_path = os.path.join(out_dir, sha)
         if os.path.exists(out_path):
             # Already processed on a previous run: keep the cached file.
             continue
         with open(out_path, "w") as out_file:
             out_file.write(result)
     self.log(DealCorpusTask.JAVA_SANIT)
Exemplo n.º 4
0
 def initlog(self):
     """Create ``<root>/data2`` if needed and configure the "corpus" logger.

     Sets ``self.outputfile`` to ``<root>/data2/CorpusOutput.txt`` and
     ``self.logger`` to the logger named "corpus" at DEBUG level, with one
     file handler writing to ``self.outputfile`` and one console handler.

     ``logging.getLogger`` returns the same logger object for the same name,
     so handlers are only attached when an equivalent one is not already
     present; otherwise every repeated call would duplicate each log line.
     """
     import logging

     data_dir = os.path.join(GetFilePathRoot.get_root_dir(), "data2")
     if not os.path.exists(data_dir):
         os.mkdir(data_dir)
     self.outputfile = os.path.join(data_dir, "CorpusOutput.txt")

     # Create (or fetch) the logger.
     self.logger = logging.getLogger("corpus")
     self.logger.setLevel(logging.DEBUG)
     # Output format shared by both handlers.
     formatter = logging.Formatter(
         '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

     # Find out which handlers an earlier call already attached.
     # FileHandler stores the absolute path in `baseFilename`, and it is a
     # StreamHandler subclass, so it has to be tested first.
     log_path = os.path.abspath(self.outputfile)
     has_file = False
     has_console = False
     for handler in self.logger.handlers:
         if isinstance(handler, logging.FileHandler):
             if handler.baseFilename == log_path:
                 has_file = True
         elif isinstance(handler, logging.StreamHandler):
             has_console = True

     if not has_file:
         # Handler that writes to the log file.
         fh = logging.FileHandler(self.outputfile)
         fh.setLevel(logging.DEBUG)
         fh.setFormatter(formatter)
         self.logger.addHandler(fh)
     if not has_console:
         # Handler that echoes to the console.
         ch = logging.StreamHandler()
         ch.setLevel(logging.DEBUG)
         ch.setFormatter(formatter)
         self.logger.addHandler(ch)
Exemplo n.º 5
0
 def mergecorpus(self,type="java"):
     """Concatenate every file under ``<root>/data3`` into the merged corpus file.

     Each file's content is written to ``self.merge_corpus_java_file``
     followed by a newline, in the order returned by ``os.listdir``.
     Progress is logged as ``(fraction_done, path)`` for every file, and
     ``self.log(DealCorpusTask.JAVA_MERGE)`` is called when finished.

     :param type: corpus kind; only ``"java"`` is supported.
     :raises ValueError: if ``type`` is not ``"java"``.
     """
     if type != "java":
         # Fail with a clear message instead of an UnboundLocalError on
         # rootDir further down.
         raise ValueError("unsupported corpus type: %r" % (type,))
     rootDir = os.path.join(GetFilePathRoot.get_root_dir(), "data3")
     targetfile = self.merge_corpus_java_file
     # List once so the progress denominator and the iteration agree.
     entries = os.listdir(rootDir)
     num = len(entries)
     # `with` guarantees the target is closed even if reading a part fails.
     with open(targetfile, 'w') as merged:
         for i, entry in enumerate(entries, 1):
             path = os.path.join(rootDir, entry)
             self.logger.info((float(i)/num, path))
             with open(path, "r") as part:
                 merged.write(part.read()+"\n")
     self.log(DealCorpusTask.JAVA_MERGE)
Exemplo n.º 6
0
    def loadcorpus(self):
        """Create lazy corpus loaders for diff1, diff2 and diff3.

        The corpus root is ``<root>/data``, decoded from GB2312 to a unicode
        string.  One ``LazyCorpusLoader`` per sub-corpus is stored on
        ``self.diff1``, ``self.diff2`` and ``self.diff3``.
        """
        data_dir = os.path.join(GetFilePathRoot.get_root_dir(), "data")
        # Decode explicitly so directory names containing Chinese characters
        # are not garbled.
        corpus_root = unicode(str(data_dir), "GB2312")
        self.logger.info(corpus_root)

        # Attribute name -> file-id pattern, in the order they are created.
        corpus_patterns = (
            ("diff1", r".*/diff1/.*\.txt"),
            ("diff2", r".*/diff2/.*\.txt"),
            ("diff3", r".*/diff3/.*\.txt"),
        )
        from nltk.corpus.util import LazyCorpusLoader
        from nltk.corpus import PlaintextCorpusReader
        self.logger.info("加载语料库")
        for attr_name, file_pattern in corpus_patterns:
            loader = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      file_pattern)
            setattr(self, attr_name, loader)
        self.logger.info("加载完毕")
Exemplo n.º 7
0
    def initlog(self):
        """Reset state flags, set up output paths and configure the "DealCorpus" logger.

        Steps, in order:

        * set ``self.logfile`` to ``<root>/data2/DealCorpus.txt`` and print it;
        * reset every ``java_*`` / ``xml_*`` boolean attribute to ``False``;
        * call ``self.log(None)`` (defined elsewhere);
        * set ``self.outputfile`` and create ``<root>/data4`` if missing;
        * set the merged-corpus, dictionary and model file paths under data4;
        * configure ``self.logger`` at DEBUG level with a file handler on
          ``self.outputfile`` and a console handler.

        ``logging.getLogger`` returns the same logger object for the same
        name, so handlers are only attached when an equivalent one is not
        already present; otherwise repeated calls would duplicate log lines.
        """
        import logging

        root_dir = GetFilePathRoot.get_root_dir()
        self.logfile = os.path.join(root_dir, "data2", "DealCorpus.txt")
        # Parenthesised form prints the same on Python 2 and also parses on
        # Python 3.
        print(self.logfile)
        self.java_dict = False
        self.java_model = False
        self.java_pic = False
        self.java_predeal = False
        self.java_sanit = False
        self.xml_dict = False
        self.xml_model = False
        self.xml_pic = False
        self.xml_predeal = False
        self.xml_sanit = False
        self.log(None)
        self.outputfile = os.path.join(root_dir, "data2", "DealCorpusOutput.txt")
        data4_dir = os.path.join(root_dir, "data4")
        if not os.path.exists(data4_dir):
            os.mkdir(data4_dir)
        self.merge_corpus_java_file = os.path.join(data4_dir, "merge_corpus_java.txt")
        self.dict_corpus_java_file = os.path.join(data4_dir, "java_dict_corpus.txt")
        self.model_corpus_java_file = os.path.join(data4_dir, "model_corpus_java")

        # Create (or fetch) the logger.
        self.logger = logging.getLogger("DealCorpus")
        self.logger.setLevel(logging.DEBUG)
        # Output format shared by both handlers.
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # Find out which handlers are already attached.  FileHandler stores
        # the absolute path in `baseFilename`, and it is a StreamHandler
        # subclass, so it has to be tested first.
        log_path = os.path.abspath(self.outputfile)
        has_file = False
        has_console = False
        for handler in self.logger.handlers:
            if isinstance(handler, logging.FileHandler):
                if handler.baseFilename == log_path:
                    has_file = True
            elif isinstance(handler, logging.StreamHandler):
                has_console = True

        if not has_file:
            # Handler that writes to the log file.
            fh = logging.FileHandler(self.outputfile)
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)
        if not has_console:
            # Handler that echoes to the console.
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)
Exemplo n.º 8
0
    def initlog(self):
        """Reset state, load the word2vec model and configure logging.

        Steps, in order:

        * set ``self.logfile`` to ``<root>/data2/ExtractModel.txt`` and print it;
        * reset ``self.merge`` and ``self.extract`` to ``False``;
        * set the four marker prefixes (``start``/``change``/``name``/``end``);
        * load ``self.model`` with ``word2vec.Word2Vec.load`` from
          ``<root>/data4/model_corpus_java``;
        * call ``self.log(None)`` (defined elsewhere);
        * set ``self.outputfile``, create ``<root>/data4`` if missing and set
          ``self.merge_extract_java_file``;
        * configure ``self.logger`` at DEBUG level with a file handler on
          ``self.outputfile`` and a console handler.

        ``logging.getLogger`` returns the same logger object for the same
        name, so handlers are only attached when an equivalent one is not
        already present; otherwise repeated calls would duplicate log lines.
        """
        import logging

        root_dir = GetFilePathRoot.get_root_dir()
        self.logfile = os.path.join(root_dir, "data2", "ExtractModel.txt")
        # Parenthesised form prints the same on Python 2 and also parses on
        # Python 3.
        print(self.logfile)

        self.merge = False
        self.extract = False

        self.start_prefix = "##### start"
        self.change_prefix = "###### change :"
        self.name_prefix = "###### name :"
        self.end_prefix = "##### end"
        data4_dir = os.path.join(root_dir, "data4")
        self.model_corpus_java_file = os.path.join(data4_dir,
                                                   "model_corpus_java")
        self.model = word2vec.Word2Vec.load(self.model_corpus_java_file)

        self.log(None)
        self.outputfile = os.path.join(root_dir, "data2",
                                       "ExtractModelOutput.txt")
        if not os.path.exists(data4_dir):
            os.mkdir(data4_dir)
        self.merge_extract_java_file = os.path.join(
            data4_dir, "merge_extract_java.txt")

        # Create (or fetch) the logger.
        # NOTE(review): the logger name "DealCorpus" looks copied from
        # another class -- confirm whether a dedicated name was intended.
        self.logger = logging.getLogger("DealCorpus")
        self.logger.setLevel(logging.DEBUG)
        # Output format shared by both handlers.
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # Find out which handlers are already attached.  The file check is
        # keyed on the path so this method's own log file is still added
        # when the logger already writes to a different file.  FileHandler
        # is a StreamHandler subclass, so it has to be tested first.
        log_path = os.path.abspath(self.outputfile)
        has_file = False
        has_console = False
        for handler in self.logger.handlers:
            if isinstance(handler, logging.FileHandler):
                if handler.baseFilename == log_path:
                    has_file = True
            elif isinstance(handler, logging.StreamHandler):
                has_console = True

        if not has_file:
            # Handler that writes to the log file.
            fh = logging.FileHandler(self.outputfile)
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)
        if not has_console:
            # Handler that echoes to the console.
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)