Example #1
def demo():

    # These names are module-level in the original knbc.py; imported here so the
    # snippet is self-contained.
    import re

    import nltk
    from nltk.corpus.reader.knbc import KNBCorpusReader
    from nltk.corpus.reader.util import find_corpus_fileids
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.data import FileSystemPathPointer

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print knbc.fileids()[:10]
    print ''.join( knbc.words()[:100] )

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)"%(m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        ).encode('utf-8')

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    print '\n'.join( ' '.join("%s/%s"%(w[0], w[1].split(' ')[2]) for w in sent)
                     for sent in knbc.tagged_sents()[0:2] )
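All of the demos in this listing follow the same pattern: LazyCorpusLoader stores the corpus name, the reader class and the reader's arguments, and only calls nltk.data.find() and builds the real reader the first time an attribute such as fileids() or words() is touched. A minimal sketch of that behaviour, assuming a plain-text corpus named 'my_corpus' has been installed under an nltk_data/corpora directory:

from nltk.corpus.reader import PlaintextCorpusReader
from nltk.corpus.util import LazyCorpusLoader

# Nothing is read from disk yet; the loader only remembers its arguments.
my_corpus = LazyCorpusLoader('my_corpus', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='utf-8')

# The first attribute access runs nltk.data.find('corpora/my_corpus') and builds
# the PlaintextCorpusReader with the arguments recorded above.
print(my_corpus.fileids())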
Example #3
def demo():

    # These names are module-level in the original knbc.py; imported here so the
    # snippet is self-contained.
    import re

    import nltk
    from nltk.corpus.reader.knbc import KNBCorpusReader
    from nltk.corpus.reader.util import find_corpus_fileids
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.data import FileSystemPathPointer

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding="euc-jp")

    print knbc.fileids()[:10]
    print "".join(knbc.words()[:100])

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    print "\n".join(" ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent) for sent in knbc.tagged_sents()[0:2])
Example #4
def parse_wsj(processes=8):
    # Imports assumed by this snippet (module-level in the original project).
    from multiprocessing import Pool
    from nltk.corpus.reader import CategorizedBracketParseCorpusReader
    from nltk.corpus.util import LazyCorpusLoader

    ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ portions
        'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt', tagset='wsj')

    fileids = ptb.fileids()
    params = []
    for f in fileids:
        corpus = zip(ptb.parsed_sents(f), ptb.tagged_sents(f))
        for i, (parsed, tagged) in enumerate(corpus):
            params.append((f, i, parsed, tagged))

    p = Pool(processes)
    p.starmap(get_best_parse, sorted(params, key=lambda x: (x[0], x[1])))
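parse_wsj() leaves get_best_parse undefined; Pool.starmap simply unpacks each (fileid, index, parsed, tagged) tuple into that worker's arguments. A hypothetical stub with a compatible signature, only to show the calling convention (the real get_best_parse is not part of this snippet):

def get_best_parse(fileid, sent_index, parsed, tagged):
    # Placeholder worker: a real implementation would rescore or reparse the
    # sentence; here we just report which sentence was handled.
    return fileid, sent_index, len(tagged)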
Example #5
File: knbc.py  Project: zlpmichelle/nltk
def demo():

    # These names are module-level in the original knbc.py; imported here so the
    # snippet is self-contained.
    import re

    import nltk
    from nltk.corpus.reader.knbc import KNBCorpusReader
    from nltk.corpus.reader.util import find_corpus_fileids
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.data import FileSystemPathPointer

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
Example #6
File: DealCorpus.py  Project: LM1201/ABA
class DealCorpus:
    count = 0
    def __init__(self):
        'Initialize the information needed for crawling'
        DealCorpus.count += 1
        # initialization; the project being used is GithubRepo
        self.initlog()
        self.loaddiff()
    def loaddiff(self):
        corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
        ## Chinese directory names can come out garbled, so decode explicitly
        corpus_root = unicode(corpus_root, "GB2312")
        self.logger.info(corpus_root)
        pattern_1 = r".*/diff1/.*\.txt"
        pattern_2 = r".*/diff2/.*\.txt"
        pattern_3 = r".*/diff3/.*\.txt"

        self.logger.info("加载语料库 lazyload")
        self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
        self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_2)
        self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_3)
        self.logger.info("加载语料库 完毕")
    def log(self,type):
        import json
        if not os.path.exists(self.logfile):
            load_dict={'java_sanit': False,'java_merge':False,'java_dict':False,'java_model':False,
                       'xml_sanit': False, 'xml_merge': False, 'xml_dict': False, 'xml_model': False,}
            logging.info(load_dict)
            with open(self.logfile, "w") as dump_f:
                json.dump(load_dict, dump_f)
                dump_f.close()
        with open(self.logfile, 'r') as load_f:
            load_dict = json.load(load_f)
            print load_dict
            load_f.close()
            if type == DealCorpusTask.JAVA_DICT:
                load_dict['java_dict'] = True
            if type == DealCorpusTask.JAVA_MODEL:
                load_dict['java_model'] = True

            if type == DealCorpusTask.JAVA_MERGE:
                load_dict['java_merge'] = True
            if type == DealCorpusTask.JAVA_SANIT:
                load_dict['java_sanit'] = True

            if type == DealCorpusTask.XML_DICT:
                load_dict['xml_dict'] = True
            if type == DealCorpusTask.XML_MODEL:
                load_dict['xml_model'] = True

            if type == DealCorpusTask.XML_MERGE:
                load_dict['xml_merge'] = True
            if type == DealCorpusTask.XML_SANIT:
                load_dict['xml_sanit'] = True

            logging.info(load_dict)
            self.java_dict = load_dict['java_dict']
            self.java_model = load_dict['java_model']

            self.java_predeal = load_dict['java_merge']
            self.java_sanit = load_dict['java_sanit']

            self.xml_dict = load_dict['xml_dict']
            self.xml_model = load_dict['xml_model']

            self.xml_predeal = load_dict['xml_merge']
            self.xml_sanit = load_dict['xml_sanit']

        # print "++++++++++++++++++++++++++++",load_dict
        with open(self.logfile, "w") as dump_f:
            json.dump(load_dict, dump_f)
            dump_f.close()

    def initlog(self):
        self.logfile =  os.path.join(GetFilePathRoot.get_root_dir(),"data2","DealCorpus.txt")
        print self.logfile
        self.java_dict =False
        self.java_model=False
        self.java_pic= False
        self.java_predeal =False
        self.java_sanit =False
        self.xml_dict = False
        self.xml_model = False
        self.xml_pic = False
        self.xml_predeal = False
        self.xml_sanit = False
        self.log(None)
        self.outputfile =  os.path.join(GetFilePathRoot.get_root_dir(),"data2","DealCorpusOutput.txt")
        if not os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(), "data4")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data4"))
        self.merge_corpus_java_file = os.path.join(GetFilePathRoot.get_root_dir(), "data4", "merge_corpus_java.txt")
        self.dict_corpus_java_file=os.path.join(GetFilePathRoot.get_root_dir(),"data4","java_dict_corpus.txt")
        self.model_corpus_java_file=os.path.join(GetFilePathRoot.get_root_dir(),"data4","model_corpus_java")


        import logging
        # create a logger
        self.logger = logging.getLogger("DealCorpus")
        self.logger.setLevel(logging.DEBUG)
        # create a handler that writes to the log file
        fh = logging.FileHandler(self.outputfile)
        fh.setLevel(logging.DEBUG)
        # create another handler that writes to the console
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        # define the output format for the handlers
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        # attach the handlers to the logger
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)

    def getAll(self):
        # return
        self.logger.info("deal java")
        if not self.java_sanit:
            self.logger.info("extract")
            self.getsents()
        if not self.java_predeal:
            self.logger.info("merge")
            self.mergecorpus()
        if not self.java_dict:
            self.logger.info("java_dict")
            self.getdict()
        if not self.java_model:
            self.logger.info("java_model")
            self.getmodel()


        self.logger.info("deal xml")
        if not self.xml_sanit:
            self.logger.info("xml_sanit")
        if not self.xml_predeal:
            self.logger.info("predeal")
        if not self.xml_dict:
            self.logger.info("xml_dict")
        if not self.xml_model:
            self.logger.info("xml_model")

    def mergecorpus(self,type="java"):
        if type =="java":
            rootDir=os.path.join(GetFilePathRoot.get_root_dir(),"data3")
            targetfile=self.merge_corpus_java_file
        i = 0
        num=len(os.listdir(rootDir))
        k = open(targetfile, 'w')
        for lists in os.listdir(rootDir):
            path = os.path.join(rootDir, lists)
            i+=1
            self.logger.info((float(i)/num,path))
            with open(path,"r") as f:
                # print f.read()
                k.write(f.read()+"\n")
        k.close()
        self.log(DealCorpusTask.JAVA_MERGE)
        pass


    def getdict(self,type="java"):
        if type == "java":
            self.sourcefilename=self.merge_corpus_java_file
            self.targetfilename=self.dict_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        sentences = word2vec.Text8Corpus(self.sourcefilename)  # load the corpus
        dictionary = gensim.corpora.Dictionary(sentences)
        print len(dictionary)
        dictionary.save_as_text(self.targetfilename, sort_by_word=True)
        if type=="java":
            self.log(DealCorpusTask.JAVA_DICT)

    def getmodel(self,type="java"):
        if type == "java":
            self.sourcefilename=self.merge_corpus_java_file
            self.targetfilename=self.model_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        sentences = word2vec.Text8Corpus(self.sourcefilename)  # load the corpus
        model = word2vec.Word2Vec(sentences, size=200)  # train a skip-gram model; default window=5
        # save the model so it can be reused
        model.save(self.targetfilename)
        if type=="java":
            self.log(DealCorpusTask.JAVA_MODEL)
    def getfile(self,type="java"):
        if type == "java":
            fileids=self.diff2.fileids()
            for file in fileids:
                print file
                yield file,self.diff2.raw(file)
        if type == "xml":
            fileids = self.diff3.fileids()
            for file in fileids:
                print file
                yield file,self.diff3.raw(file)
    def getsha(self,string):
        from hashlib import sha1
        ss= sha1(string)
        return  ss.hexdigest()
    def getsents(self,type="java"):
        self.logger.info("getsents")
        if not os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(),"data3")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(),"data3"))
        self.sha_name_file=os.path.join(GetFilePathRoot.get_root_dir(),"data2","DealCorpus_sha_name_csv")
        i=0
        self.logger.info(i)
        for name,file in self.getfile(type):
            i+=1
            self.logger.info((i,name))
            sents= file.replace("\r\n","\n").replace("\r","\n").split("\n")
            result = self.getpredeal(sents=sents)
            with open(self.sha_name_file, "a") as f:
                ff = csv.writer(f)
                ff.writerow([self.getsha(name), name])
                f.close()
            if os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(),"data3",self.getsha(name))):
                continue
            with open(os.path.join(GetFilePathRoot.get_root_dir(),"data3",self.getsha(name)),"w") as f:
                f.write(result)
                f.close()
        self.log(DealCorpusTask.JAVA_SANIT)

    def getpredeal(self,sents):
        # "stop" tokens (operators and punctuation) to strip from each line
        newsents=[]
        stop=["+","-","*","/","%","=","!",">","<","&","|","^","~","?","(",")","[","]","{","}",",",";"]
        comment=False
        for line in sents:
            import re
            if re.search("(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)", line, re.I):
                line= line.replace(re.search("(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)", line, re.I).group(1),"")
                # print line
            if line != u' ':
                newline =None
                if "//" in line:
                    newline = line[0:line.index("//")]
                if "/*" in line:
                    comment =True
                    newline = line[0:line.index("/*")]
                if "*/" in line:
                    newline=line[line.index("*/")+2:len(line) ]
                    # print newline
                    self.logger.info(newline)
                    comment = False
                if "/*"in line and "*/" in line:
                    if line.index("/*")<=line.index("*/"):
                        newline = line[0:line.index("/*")]+line[line.index("*/")+2:len(line)]
                        comment = False
                if comment or newline !=None:
                    pass
                else:
                    newline=line
                if newline !=None:
                    for s in stop:
                        if s in newline:
                            newline=" ".join(newline.split(s))
                    newline= newline.replace("\r\n", " ").replace("\n", " ").replace("\t", " ")
                    newline = " ".join(newline.split(" "))
                    newsents.append(newline)
                newline=None
        result=""
        for line in newsents:
            for word in line.split(" "):
                if word != "":
                    result+=word+" "
        return result


    def gendiffpicture(self, word="activity", type="java", size=3):
        if type=="java":
            modelfile = self.model_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        model = word2vec.Word2Vec.load(modelfile)
        G = nx.Graph()
        blacknode = set([])
        def get(startwordlist):
            result = []
            for word in startwordlist:
                y2 = model.most_similar(word,topn=6)  # the 6 most similar words
                for item in y2:
                    if not item[0] in blacknode:
                        result.append(item[0])
                        G.add_node(item[0])
                        G.add_weighted_edges_from([(word, item[0], item[1])])
                blacknode.add(word)
            return result

        start = [word]
        i = 0;
        while True:
            i += 1;
            if i == size:
                break
            result = get(start)
            print result
            start = result
        for node in G.nodes():
            print node
        pos = nx.spring_layout(G)
        nx.draw(G, pos=pos, node_color="r", with_labels=True, node_size=900, font_size=10)
        plt.show()

    def getsmailarword(self, word="activity", type="java", number=10):
        'Get similar words'
        if type == "java":
            modelfile = self.model_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        model = word2vec.Word2Vec.load(modelfile)
        return model.most_similar(word, topn=number)
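loaddiff() above hands an absolute corpus_root to LazyCorpusLoader. The same diff directories can also be opened eagerly with PlaintextCorpusReader, which takes a root path directly instead of going through nltk.data. A minimal sketch, assuming the same data directory layout and file pattern used in loaddiff() (paths are placeholders):

from nltk.corpus.reader import PlaintextCorpusReader

corpus_root = "data"  # assumed location of the diff directories
diff1 = PlaintextCorpusReader(corpus_root, r".*/diff1/.*\.txt")
print(diff1.fileids()[:5])  # files are resolved immediately, not lazily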
Example #7
File: ExcateMode.py  Project: LM1201/ABA
class ExcateMode:
    count = 0

    def __init__(self):
        'Initialize the information needed for crawling'
        ExcateMode.count += 1
        # initialization; the project being used is GithubRepo
        self.initlog()
        self.loaddiff()

    def loaddiff(self):
        corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
        ## Chinese directory names can come out garbled, so decode explicitly
        corpus_root = unicode(corpus_root, "GB2312")
        self.logger.info(corpus_root)
        pattern_1 = r".*/diff1/.*\.txt"
        self.logger.info("加载语料库 lazyload")
        self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      pattern_1)
        self.logger.info("加载语料库 完毕")

    def log(self, type):
        import json
        if not os.path.exists(self.logfile):
            load_dict = {'extract': False, 'merge': False}
            logging.info(load_dict)
            with open(self.logfile, "w") as dump_f:
                json.dump(load_dict, dump_f)
                dump_f.close()
        with open(self.logfile, 'r') as load_f:
            load_dict = json.load(load_f)
            print load_dict
            load_f.close()
            if type == ExtractModeTask.MERGE:
                load_dict['merge'] = True
            if type == ExtractModeTask.EXTRACT:
                load_dict['extract'] = True

            logging.info(load_dict)
            self.merge = load_dict['merge']
            self.extract = load_dict['extract']

        # print "++++++++++++++++++++++++++++",load_dict
        with open(self.logfile, "w") as dump_f:
            json.dump(load_dict, dump_f)
            dump_f.close()

    def initlog(self):
        self.logfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2",
                                    "ExtractModel.txt")
        print self.logfile

        self.merge = False
        self.extract = False

        self.start_prefix = "##### start"
        self.change_prefix = "###### change :"
        self.name_prefix = "###### name :"
        self.end_prefix = "##### end"
        self.model_corpus_java_file = os.path.join(
            GetFilePathRoot.get_root_dir(), "data4", "model_corpus_java")
        modelfile = self.model_corpus_java_file
        self.model = word2vec.Word2Vec.load(modelfile)

        self.log(None)
        self.outputfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2",
                                       "ExtractModelOutput.txt")
        if not os.path.exists(
                os.path.join(GetFilePathRoot.get_root_dir(), "data4")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data4"))
        self.merge_extract_java_file = os.path.join(
            GetFilePathRoot.get_root_dir(), "data4", "merge_extract_java.txt")

        import logging
        # create a logger
        self.logger = logging.getLogger("DealCorpus")
        self.logger.setLevel(logging.DEBUG)
        # create a handler that writes to the log file
        fh = logging.FileHandler(self.outputfile)
        fh.setLevel(logging.DEBUG)
        # create another handler that writes to the console
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        # define the output format for the handlers
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        # attach the handlers to the logger
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)

    def getAll(self):
        # with open("test_1","r") as f:
        #     file= f.read()
        #     sents = file.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        #     ## process the sentences
        #     dealsents = self.getpredeal(sents=sents)
        #     all = self.getexcate(dealsents)
        # return
        self.logger.info("deal java")
        if not self.extract:
            self.logger.info("extract")
            self.getsents()

    def getfile(self, type="java"):
        fileids = self.diff1.fileids()
        for file in fileids:
            print file
            yield file, self.diff1.raw(file)

    def getsha(self, string):
        from hashlib import sha1
        ss = sha1(string)
        return ss.hexdigest()

    def getsents(self, type="java"):
        self.logger.info("getsents")
        import json
        # store the models extracted from the processed diff1 corpus
        if not os.path.exists(
                os.path.join(GetFilePathRoot.get_root_dir(), "data5")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data5"))
        # record the mapping between the original files and the extraction results
        self.sha_name_file = os.path.join(GetFilePathRoot.get_root_dir(),
                                          "data2", "extract_sha_name_csv")
        i = 0
        self.logger.info(i)
        for name, file in self.getfile(type):
            i += 1
            self.logger.info((i, name))
            sents = file.replace("\r\n", "\n").replace("\r", "\n").split("\n")
            ## process the sentences
            dealsents = self.getpredeal(sents=sents)
            result = self.getexcate(dealsents)
            # csv file that records the mapping
            with open(self.sha_name_file, "a") as f:
                ff = csv.writer(f)
                ff.writerow([self.getsha(name), name])
                f.close()
            if os.path.exists(
                    os.path.join(GetFilePathRoot.get_root_dir(), "data5",
                                 self.getsha(name) + ".json")):
                continue
            with open(
                    os.path.join(GetFilePathRoot.get_root_dir(), "data5",
                                 self.getsha(name)) + ".json", "w") as dump_f:
                json.dump(result, dump_f)
                dump_f.close()
        self.log(ExtractModeTask.EXTRACT)
        return

    def getexcate(self, dealsents):
        all = []
        commit = []
        file = []
        change = {'del': [], "add": [], "content": [], "diff": []}
        commitstart = False
        for sent in dealsents:
            if self.start_prefix in sent:
                commitstart = True
            if commitstart == True:
                if self.change_prefix in sent:
                    pass
                elif self.name_prefix in sent:
                    if change != {
                            'del': [],
                            "add": [],
                            "content": [],
                            "diff": []
                    }:
                        file.append(change)
                        commit.append(file)
                        # print file
                        file = []
                        change = {
                            'del': [],
                            "add": [],
                            "content": [],
                            "diff": []
                        }
                elif self.start_prefix in sent:
                    pass
                    # print sent
                elif self.end_prefix in sent:
                    commitstart = False
                    if change != {
                            'del': [],
                            "add": [],
                            "content": [],
                            "diff": []
                    }:
                        file.append(change)
                        commit.append(file)
                        all.append(commit)
                        commit = []
                        file = []
                        change = {
                            'del': [],
                            "add": [],
                            "content": [],
                            "diff": []
                        }
                else:
                    import re
                    if re.search(
                            "(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)",
                            sent, re.I):
                        sent = sent.replace(
                            re.search(
                                "(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)",
                                sent, re.I).group(1), "")
                        if change != {
                                'del': [],
                                "add": [],
                                "content": [],
                                "diff": []
                        }:
                            file.append(change)
                            change = {
                                'del': [],
                                "add": [],
                                "content": [],
                                "diff": []
                            }
                        if sent.replace(" ", "") != "":
                            if sent.startswith("-") and sent.replace(
                                    "-", "").replace(" ", "") != "":
                                change['del'].append(sent)
                            elif sent.startswith("+") and sent.replace(
                                    "+", "").replace(" ", "") != "":
                                change['add'].append(sent)
                            else:
                                if sent.replace("-", "").replace(
                                        " ", "") != "" and sent.replace(
                                            "+", "").replace(" ", "") != "":
                                    change["content"].append(sent)
                    else:
                        if sent.replace(" ", "") != "":
                            if sent.startswith("-") and sent.replace(
                                    "-", "").replace(" ", ""):
                                change['del'].append(sent)
                            elif sent.startswith("+") and sent.replace(
                                    "+", "").replace(" ", ""):
                                change['add'].append(sent)
                            else:
                                if sent.replace("-", "").replace(
                                        " ", "") != "" and sent.replace(
                                            "+", "").replace(" ", "") != "":
                                    change["content"].append(sent)
        for commit in all:
            # print  "aaaaaaa",commit
            for file in commit:
                # print "ffffffff" ,file
                for change in file:
                    # print "cccccccccccc",change
                    # print "            ",change['content']
                    change['diff'] = self.getsmailer(change['add'],
                                                     change['del'])
                    for diff in change['diff']:
                        if diff[1] in change['add']:
                            # print "            remove add"
                            change['add'].remove(diff[1])
                        if diff[2] in change['del']:
                            # print "            remove del"
                            change['del'].remove(diff[2])
                    # print "            ", change['del']
                    # print "            ", change['add']
                    # for diff in  change['diff']:
                    # print "            dddddddd",diff
        return all

    def emd(self, dist, w1, w2):
        import numpy as np
        import rpy2.robjects as robjects
        # import lp.transport() from R
        robjects.r['library']('lpSolve')
        transport = robjects.r['lp.transport']
        """R的transport()函数用来计算EMD"""
        # arguments for transport()
        costs = robjects.r['matrix'](robjects.FloatVector(dist),
                                     nrow=len(w1),
                                     ncol=len(w2),
                                     byrow=True)
        row_signs = ["<"] * len(w1)
        row_rhs = robjects.FloatVector(w1)
        col_signs = [">"] * len(w2)
        col_rhs = robjects.FloatVector(w2)
        t = transport(costs, "min", row_signs, row_rhs, col_signs, col_rhs)
        flow = t.rx2('solution')
        dist = dist.reshape(len(w1), len(w2))
        flow = np.array(flow)
        work = np.sum(flow * dist)
        # print "***", (np.sum(flow)),work
        emd = (work + np.float64(2)) / (np.sum(flow) + np.float64(0.1))
        return emd

    def getsentencesmaliar(self, scenceA, scenceB):
        # Use word vectors to measure sentence similarity. Two issues: 1. running time; 2. only a Java model has been trained so far.
        am = self.model
        import numpy as np
        f1 = scenceA.split()
        f2 = scenceB.split()
        n1 = len(scenceA.split())
        n2 = len(scenceB.split())
        # build a distance matrix
        dist = np.zeros(n1 * n2)
        for i in range(n1):
            for j in range(n2):
                try:
                    t1 = am.wv[f1[i]]
                except KeyError:
                    continue
                    pass
                try:
                    t2 = am.wv[f2[j]]
                except:
                    continue
                    pass
                dist[i * n2 + j] = self.euclid_dist(t1, t2) + 0.01
        first_signature = np.ones(n1)
        second_signature = np.ones(n2)
        # print  "*****", dist
        return 1.0 / (1 + self.emd(dist, first_signature, second_signature))

    def euclid_dist(self, feature1, feature2):
        """计算欧氏距离"""
        if len(feature1) != len(feature2):
            print "ERROR: calc euclid_dist: %d <=> %d" % (len(feature1),
                                                          len(feature2))
            return -1
        return np.sqrt(np.sum((feature1 - feature2)**2))

    def getsmailer(self, add=[], dele=[]):
        # the time cost is rather high
        result = []
        if len(add) <= len(dele):
            for sa in add:
                minvalue = 0
                temp = ("", "")
                for sd in dele:
                    import difflib
                    seq = difflib.SequenceMatcher(None, sa, sd)
                    ratio = seq.ratio()
                    # print ratio,(nsa,nsd)
                    smailarvalue = ratio
                    if smailarvalue > minvalue:
                        minvalue = smailarvalue
                        temp = (sa, sd)
                if minvalue != 0:
                    result.append((minvalue, temp[0], temp[1]))
                    # print minvalue,self.getsentencesmaliar(temp[2],temp[3]), (temp[0],temp[1])
        elif len(add) > len(dele):
            for sd in dele:
                minvalue = 0
                temp = ("", "")
                for sa in add:
                    import difflib
                    seq = difflib.SequenceMatcher(None, sa, sd)
                    ratio = seq.ratio()
                    # print ratio, (nsa, nsd)
                    smailarvalue = ratio
                    if smailarvalue > minvalue:
                        minvalue = smailarvalue
                        temp = (sa, sd)
                if minvalue != 0:
                    result.append((minvalue, temp[0], temp[1]))
                    # print minvalue, self.getsentencesmaliar(temp[2], temp[3]), (temp[0], temp[1])
        return result

    def getpredeal(self, sents):
        # load the stop-word list
        newsents = []
        comment = False
        for line in sents:
            if line != u' ':
                newline = None
                if "//" in line:
                    newline = line[0:line.index("//")]
                if "/*" in line:
                    comment = True
                    newline = line[0:line.index("/*")]
                if "*/" in line:
                    newline = line[line.index("*/") + 2:len(line)]
                    # print newline
                    self.logger.info(newline)
                    comment = False
                if "/*" in line and "*/" in line:
                    if line.index("/*") <= line.index("*/"):
                        newline = line[0:line.
                                       index("/*")] + line[line.index("*/") +
                                                           2:len(line)]
                        comment = False
                if comment or newline != None:
                    pass
                else:
                    newline = line
                if newline != None:
                    newline = " ".join(newline.split(" "))
                    newsents.append(newline)
                newline = None
        result = []
        for line in newsents:
            newline = ""
            for word in line.split(" "):
                if word != "":
                    newline += word + " "
            result.append(newline)
        return result
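getsentencesmaliar() builds a word-to-word distance matrix by hand and ships it to R's lp.transport() to obtain an EMD-style score. Recent gensim releases expose a comparable measure, Word Mover's Distance, directly on the trained vectors; a minimal sketch, assuming the Java word2vec model trained by DealCorpus and that gensim's optional WMD dependency is installed:

from gensim.models import word2vec

model = word2vec.Word2Vec.load("model_corpus_java")  # path assumed, as in initlog()
a = "public void onCreate Bundle savedInstanceState".split()
b = "protected void onCreate Bundle state".split()
# Smaller distance means more similar; out-of-vocabulary tokens are ignored.
print(model.wv.wmdistance(a, b))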
Example #8
def summarize_cisco_support_forum_texts():
    # cisco_plain_text = LazyCorpusLoader(
    #    'content', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin_1')
    cisco_plain_text = LazyCorpusLoader(
        "cisco_forum_subset", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin_1"
    )
    token_dict = {}
    for article in cisco_plain_text.fileids():
        token_dict[article] = cisco_plain_text.raw(article)

    tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words="english", decode_error="ignore")

    sys.stdout.flush()

    # creates Compressed Sparse Row format numpy matrix
    tdm = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()

    # problem_statement_#1 - summarize support_forum articles automatically
    for article_id in range(0, tdm.shape[0] - 2):
        article_text = cisco_plain_text.raw(cisco_plain_text.fileids()[article_id])
        sent_scores = []
        for sentence in nltk.sent_tokenize(article_text):
            score = 0
            sent_tokens = tokenize_and_stem(sentence)
            for token in (t for t in sent_tokens if t in feature_names):
                score += tdm[article_id, feature_names.index(token)]
            sent_scores.append((score / len(sent_tokens), sentence))
        summary_length = int(math.ceil(len(sent_scores) / 5))
        sent_scores.sort(key=lambda sent: sent[0])
        print "\n*** SUMMARY ***"
        for summary_sentence in sent_scores[:summary_length]:
            print summary_sentence[1]
        print "\n*** ORIGINAL ***"
        print article_text

    # problem_statement_#2 - automatically categorize forum posts by tags into various groups
    reduce_dimensionality_and_cluster_docs(tfidf, tdm, num_features=200)

    # problem_statement_#3 - find similar documents to a current document (that user is reading) automatically
    # eg - quora: find similar questions, find similar answers
    cosine_similarity(tdm[0:1], tdm)
    """
    output looks like this
    array([[ 1.        ,  0.22185251,  0.0215558 ,  0.03805012,  0.04796646,
         0.05069365,  0.05507056,  0.03374501,  0.03643342,  0.05308392,
         0.06002623,  0.0298806 ,  0.04177088,  0.0844478 ,  0.07951179,
         0.02822186,  0.03036787,  0.11022385,  0.0535391 ,  0.10009412,
         0.07432719,  0.03753424,  0.06596462,  0.01256566,  0.02135591,
         0.13931643,  0.03062681,  0.02595649,  0.04897851,  0.06276997,
         0.03173952,  0.01822134,  0.04043555,  0.06629454,  0.05436211,
         0.0549144 ,  0.04400169,  0.05157118,  0.05409632,  0.09541703,
         0.02473209,  0.05646599,  0.05728387,  0.04672681,  0.04519217,
         0.04126276,  0.06289187,  0.03116767,  0.04828476,  0.04745193,
         0.01404426,  0.04201325,  0.023492  ,  0.07138136,  0.03778315,
         0.03677206,  0.02553581]])
    The first document is compared to the rest; the most similar to it is itself, with a score of 1, and the next most similar is the document with a score of 0.22185251
    """

    cosine_similarities = linear_kernel(tdm[0:1], tdm).flatten()

    # mapping back to document_name space
    related_docs_indices = cosine_similarities.argsort()
    """
    document_ids
    array([23, 50, 31, 24,  2, 52, 40, 56, 27, 15, 11, 16, 26, 47, 30,  7,  8,
       55, 21, 54,  3, 32, 45, 12, 51, 36, 44, 43, 49,  4, 48, 28,  5, 37,
        9, 18, 38, 34, 35,  6, 41, 42, 10, 29, 46, 22, 33, 53, 20, 14, 13,
       39, 19, 17, 25,  1,  0])

       docs 0 and 1 are very similar; they are the following posts (the last 2 array elements above, when sorted)
        https://supportforums.cisco.com/discussion/11469881/aniserver-failed-run-lms-40
        and
        supportforums.cisco.com/discussion/11469606/eos-lms-31-support-quest
    """

    cosine_similarities[related_docs_indices]
    for key, value in token_dict.iteritems():
        print key, value
    # find the actual posts which are the most similar
    tfidf.inverse_transform(tdm)[0]
    tfidf.inverse_transform(tdm)[1]
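summarize_cisco_support_forum_texts() relies on a tokenize_and_stem() helper that is not included in the snippet. A hypothetical version, assuming NLTK's word tokenizer and Snowball stemmer (the original helper may differ):

import re

import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # Lowercase, tokenize, keep purely alphabetic tokens and stem them.
    tokens = [t.lower() for t in nltk.word_tokenize(text)]
    return [stemmer.stem(t) for t in tokens if re.match(r"[a-zA-Z]+$", t)]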
Example #9
File: bamana.py  Project: eldams/daba
wordlist = LazyCorpusLoader(
        'bamana/wordlist', PlaintextCorpusReader, r'bailleul.clean.wordlist', word_tokenizer=orthographic_word, encoding='utf-8')

properlist = LazyCorpusLoader(
        'bamana/propernames', PlaintextCorpusReader, r'.*\.clean\.wordlist', word_tokenizer=orthographic_word, encoding='utf-8')

propernames = LazyCorpusLoader(
        'bamana/propernames', ToolboxCorpusReader, '.*\.txt', encoding='utf-8')

bailleul = LazyCorpusLoader(
        'bamana/bailleul', ToolboxCorpusReader, r'bailleul.txt', encoding='utf-8')

lexicon = ElementTree(bailleul.xml('bailleul.txt'))

for file in propernames.fileids():
    for e in ElementTree(propernames.xml(file)).findall('record'):
        ge = Element('ge')
        ge.text = e.find('lx').text
        e.append(ge)
        ps = Element('ps')
        ps.text = 'n.prop'
        e.append(ps)
        lexicon.getroot().append(e)

wl = {}
wl_detone = {}

def normalize_bailleul(word):
    return u''.join([c for c in word if c not in u'.-'])
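Once the proper-name records have been appended, lexicon is an ordinary ElementTree and can be written back out. A minimal sketch, assuming the ElementTree used above comes from xml.etree.ElementTree and that the output filename is only an example:

# Serialize the merged Toolbox lexicon to XML.
lexicon.write('bailleul_with_propernames.xml', encoding='utf-8')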
Example #10
interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro',
    CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=','
)

print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]


print "generating list of documents for each category"
documents = [
    (list(interrogazioni.words(fileid)), category)
    for category in interrogazioni.categories()
    for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)

print "building the classifier"
featuresets = [(document_features(d, top_words), c) for (d,c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print "classifier accuracy: ", nltk.classify.accuracy(classifier, test_set)





Example #11
def loadClassifier(outputdir):
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle") 
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
#        classifier = pickleLoad("originalnaivebayes.pickle")
#        MNB_classifier = pickleLoad("MNB_classifier.pickle")
#        BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
#        LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
#        SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
#        LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
#        
#        voted_classifier = VoteClassifier(classifier,
##                                  NuSVC_classifier,
#                                  LinearSVC_classifier,
#                                  SGDClassifier_classifier,
#                                  MNB_classifier,
#                                  BernoulliNB_classifier,
#                                  LogisticRegression_classifier)
        voted_classifier= pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
                'criticas_cine', CategorizedPlaintextCorpusReader,
                r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
                encoding='utf-8')
#        criticas_cine = LazyCorpusLoader(
#                'criticas_cine_neu', CategorizedPlaintextCorpusReader,
#                r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
#                encoding='utf-8')
            
        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
#            
#        document_pos = [(list(criticas_cine.words(fileid)), "pos")
#                        for fileid in criticas_cine.fileids("pos")]
#        document_neg = [(list(criticas_cine.words(fileid)), "neg")
#                        for fileid in criticas_cine.fileids("neg")]
#        document_neu = [(list(criticas_cine.words(fileid)), "neu")
#                        for fileid in criticas_cine.fileids("neu")]
        
        random.shuffle(documents)
        
#        random.shuffle(document_pos)
#        random.shuffle(document_neg)
#        random.shuffle(document_neu)
        
        all_words = []
        
        for w in criticas_cine.words():
            all_words.append(w.lower())
        
#        for w in criticas_cine.words():
#            if not is_filtered(w.lower()):
#                all_words.append(w.lower())
#        
        all_words = nltk.FreqDist(all_words)
        
        #print (all_words.most_common(50))
        
        # Filtering by type of word
        
#        for sample in all_words:
                    
        
        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")
        
        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
        
#        featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
#        featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
#        featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]
        
#        training_set = featuresetpos[:1000]
#        training_set.extend(featuresetneg[:1000])
#        training_set.extend(featuresetneu[:1000])
#        testing_set = featuresetpos[1000:1273]
#        testing_set.extend(featuresetneg[1000:])
#        testing_set.extend(featuresetneu[1000:])

#        pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
#        neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
#        neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]
                
        training_set = featuresets[:2000]
        testing_set =  featuresets[2000:]
        classifier = nltk.NaiveBayesClassifier.train(training_set)
#        pickleDump(classifier, "originalnaivebayes.pickle")
    
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy)*100)
        
        accuracy = Accuracy(classifier,testing_set)
        print(accuracy)
        # order: neu, neg, pos
#        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
#        print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
#        print("Failed: ", (accuracy["neu"][1]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][1])/3)
#        print ("Pos:", nltk.classify.accuracy(classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(classifier, neg_feat)*100)
        classifier.show_most_informative_features(15)
        
        MNB_classifier = SklearnClassifier(MultinomialNB())
        MNB_classifier.train(training_set)
        MNB_classifierAccuracy = nltk.classify.accuracy(MNB_classifier, testing_set)
        print("MNB_classifier accuracy percent:", (MNB_classifierAccuracy)*100)
#        pickleDump(MNB_classifier, "MNB_classifier.pickle")
        
        BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
        BernoulliNB_classifier.train(training_set)
        BernoulliNB_classifierAccuracy = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
        print("BernoulliNB_classifier accuracy percent:", (BernoulliNB_classifierAccuracy)*100)
#        pickleDump(BernoulliNB_classifier, "BernoulliNB_classifier.pickle")
        
        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(training_set)
        LogisticRegression_classifierAccuracy = nltk.classify.accuracy(LogisticRegression_classifier, testing_set)
        print("LogisticRegression_classifier accuracy percent:", (LogisticRegression_classifierAccuracy)*100)
#        pickleDump(LogisticRegression_classifier, "LogisticRegression_classifier.pickle")
        
        SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
        SGDClassifier_classifier.train(training_set)
        SGDClassifier_classifierAccuracy = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
        print("SGDClassifier_classifier accuracy percent:", (SGDClassifier_classifierAccuracy)*100)
#        pickleDump(SGDClassifier_classifier, "SGDClassifier_classifier.pickle")
        
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(training_set)
        LinearSVC_classifierAccuracy = nltk.classify.accuracy(LinearSVC_classifier, testing_set)
        print("LinearSVC_classifier accuracy percent:", (LinearSVC_classifierAccuracy)*100)
#        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")
        
#        SVC_classifier = SklearnClassifier(SVC())
#        SVC_classifier.train(training_set)
#        print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)
        
        NuSVC_classifier = SklearnClassifier(NuSVC())
        NuSVC_classifier.train(training_set)
        NuSVC_classifierAccuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set)
        print("NuSVC_classifier accuracy percent:", (NuSVC_classifierAccuracy)*100)
        #        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")
        
        
#        pickleDump([NaiveBayesClassifierAccuracy, 
#                    LinearSVC_classifierAccuracy,
#                    SGDClassifier_classifierAccuracy,
#                    MNB_classifierAccuracy,
#                    BernoulliNB_classifierAccuracy,
#                    LogisticRegression_classifierAccuracy], "accuracies.pickle")
        
        voted_classifier = VoteClassifier([classifier,NaiveBayesClassifierAccuracy],
                                          [NuSVC_classifier,NuSVC_classifierAccuracy],
                                          [LinearSVC_classifier,LinearSVC_classifierAccuracy],
                                          [SGDClassifier_classifier,SGDClassifier_classifierAccuracy],
                                          [MNB_classifier,MNB_classifierAccuracy],
                                          [BernoulliNB_classifier,BernoulliNB_classifierAccuracy],
                                          [LogisticRegression_classifier,LogisticRegression_classifierAccuracy])

        accuracy = Accuracy(voted_classifier,testing_set)
        print(accuracy)
        VoteClassifierAccuracy = nltk.classify.accuracy(voted_classifier, testing_set)
        print("VoteClassifier accuracy percent:", (VoteClassifierAccuracy)*100)
#        print ("Pos:", nltk.classify.accuracy(voted_classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(voted_classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(voted_classifier, neg_feat)*100)
        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/2)
        print("Discarded: ", (accuracy["neu"][1]+accuracy["neg"][1]+accuracy["pos"][1])/2)
        print("Failed: ", (accuracy["neu"][0]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][0])/2)
        print("------------------------------------------");
                                          
        pickleDump(voted_classifier, "voted_classifier.pickle")

        return voted_classifier, word_features
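loadClassifier() depends on pickleDump(), pickleLoad() and a VoteClassifier class defined elsewhere in the project. Hypothetical versions of the two pickle helpers, matching how they are called above and assuming the same pickled_algos directory (the real helpers may differ):

import os
import pickle

def pickleDump(obj, filename):
    # Store an object under pickled_algos/<filename>.
    with open(os.path.join("pickled_algos", filename), "wb") as f:
        pickle.dump(obj, f)

def pickleLoad(filename):
    # Load an object previously stored with pickleDump().
    with open(os.path.join("pickled_algos", filename), "rb") as f:
        return pickle.load(f)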
Example #12
def summarize_cisco_support_forum_texts():
    #cisco_plain_text = LazyCorpusLoader(
    #    'content', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin_1')
    cisco_plain_text = LazyCorpusLoader('cisco_forum_subset',
                                        PlaintextCorpusReader,
                                        r'(?!\.).*\.txt',
                                        encoding='latin_1')
    token_dict = {}
    for article in cisco_plain_text.fileids():
        token_dict[article] = cisco_plain_text.raw(article)

    tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem,
                            stop_words='english',
                            decode_error='ignore')

    sys.stdout.flush()

    #creates Compressed Sparse Row format numpy matrix
    tdm = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()

    #problem_statement_#1 - summarize support_forum articles automatically
    for article_id in range(0, tdm.shape[0] - 2):
        article_text = cisco_plain_text.raw(
            cisco_plain_text.fileids()[article_id])
        sent_scores = []
        for sentence in nltk.sent_tokenize(article_text):
            score = 0
            sent_tokens = tokenize_and_stem(sentence)
            for token in (t for t in sent_tokens if t in feature_names):
                score += tdm[article_id, feature_names.index(token)]
            sent_scores.append((score / len(sent_tokens), sentence))
        summary_length = int(math.ceil(len(sent_scores) / 5))
        sent_scores.sort(key=lambda sent: sent[0])
        print '\n*** SUMMARY ***'
        for summary_sentence in sent_scores[:summary_length]:
            print summary_sentence[1]
        print '\n*** ORIGINAL ***'
        print article_text

    #problem_statement_#2 - automatically categorize forum posts by tags into various groups
    reduce_dimensionality_and_cluster_docs(tfidf, tdm, num_features=200)

    #problem_statement_#3 - find similar documents to a current document (that user is reading) automatically
    #eg - quora: find similar questions, find similar answers
    cosine_similarity(tdm[0:1], tdm)
    '''
    output looks like this
    array([[ 1.        ,  0.22185251,  0.0215558 ,  0.03805012,  0.04796646,
         0.05069365,  0.05507056,  0.03374501,  0.03643342,  0.05308392,
         0.06002623,  0.0298806 ,  0.04177088,  0.0844478 ,  0.07951179,
         0.02822186,  0.03036787,  0.11022385,  0.0535391 ,  0.10009412,
         0.07432719,  0.03753424,  0.06596462,  0.01256566,  0.02135591,
         0.13931643,  0.03062681,  0.02595649,  0.04897851,  0.06276997,
         0.03173952,  0.01822134,  0.04043555,  0.06629454,  0.05436211,
         0.0549144 ,  0.04400169,  0.05157118,  0.05409632,  0.09541703,
         0.02473209,  0.05646599,  0.05728387,  0.04672681,  0.04519217,
         0.04126276,  0.06289187,  0.03116767,  0.04828476,  0.04745193,
         0.01404426,  0.04201325,  0.023492  ,  0.07138136,  0.03778315,
         0.03677206,  0.02553581]])
    The first document is compared to the rest; the most similar to it is itself, with a score of 1, and the next most similar is the document with a score of 0.22185251
    '''

    cosine_similarities = linear_kernel(tdm[0:1], tdm).flatten()

    #mapping back to document_name space
    related_docs_indices = cosine_similarities.argsort()
    '''
    document_ids
    array([23, 50, 31, 24,  2, 52, 40, 56, 27, 15, 11, 16, 26, 47, 30,  7,  8,
       55, 21, 54,  3, 32, 45, 12, 51, 36, 44, 43, 49,  4, 48, 28,  5, 37,
        9, 18, 38, 34, 35,  6, 41, 42, 10, 29, 46, 22, 33, 53, 20, 14, 13,
       39, 19, 17, 25,  1,  0])

       docs 0 and 1 are very similar; they are the following posts (the last 2 array elements above, when sorted)
        https://supportforums.cisco.com/discussion/11469881/aniserver-failed-run-lms-40
        and
        supportforums.cisco.com/discussion/11469606/eos-lms-31-support-quest
    '''

    cosine_similarities[related_docs_indices]
    for key, value in token_dict.iteritems():
        print key, value
    #find the actual posts which are the most similar
    tfidf.inverse_transform(tdm)[0]
    tfidf.inverse_transform(tdm)[1]
Example #13
train_test_ratio = 2.0/3



def pickleObject():
	obj = classifier
	savefile = open('classifier.pickle', 'w')
	cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)

def pickleFeats():
	obj = words_in_sentence
	savefile = open('feats.pickle', 'w')
	cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)

files_in_neg = movie_reviews.fileids('neg')
files_in_pos = movie_reviews.fileids('pos')

neg_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'neg') for f in files_in_neg]
pos_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'pos') for f in files_in_pos]

negative_first_test_pos = int(len(neg_data)*train_test_ratio)
positive_first_test_pos = int(len(pos_data)*train_test_ratio)

train_data = neg_data[:negative_first_test_pos] + pos_data[:positive_first_test_pos]
test_data = neg_data[negative_first_test_pos:] + pos_data[positive_first_test_pos:]
print 'training on %d paragraphs and testing on %d paragraphs' % (len(train_data), len(test_data))

classifier = NaiveBayesClassifier.train(train_data)
print 'accuracy:', nltk.classify.util.accuracy(classifier, test_data)
classifier.show_most_informative_features(20)
Example #14
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.text import TextCollection
from collections import Counter
from nltk.tokenize import word_tokenize
from operator import itemgetter



dialect = LazyCorpusLoader('dialects1', CategorizedPlaintextCorpusReader, r'(?!\.).*\.txt', cat_pattern=r'(egyptian|gulf|levantine|standardArabic)/.*', encoding="utf-8")
x = TextCollection(dialect)

sentences = [(list(dialect.words(fileid)[i:i+40]),category)
             for category in dialect.categories()
             for fileid in dialect.fileids(category)
             for i in range(0,len(dialect.words(fileid)),40)]


shuffled_sentences = random.sample(sentences, len(sentences))
print('sentences count',len(sentences))

text = dialect.words()
print('words count',len(text))

#################### Test with getting topN ############################################################################
# all_words = nltk.FreqDist(w for w in dialect.words())
# Mcommon = all_words.most_common(4000)
# topN = [i[0] for i in Mcommon]
# print('finished topN')
########################################################################################################################
Example #15
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import WordListCorpusReader

reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist.txt'])
print(isinstance(reader, LazyCorpusLoader))

print(reader.fileids())
print(isinstance(reader, LazyCorpusLoader))
print(isinstance(reader, WordListCorpusReader))
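For this example to run, a cookbook directory containing wordlist.txt has to sit inside one of the corpora folders on nltk.data.path; after the first fileids() call the loader swaps in the real reader's class, which is what the two isinstance() checks at the end demonstrate. A minimal setup sketch, assuming the per-user nltk_data directory (adjust the path for your platform):

import os

corpus_dir = os.path.expanduser('~/nltk_data/corpora/cookbook')
os.makedirs(corpus_dir, exist_ok=True)
with open(os.path.join(corpus_dir, 'wordlist.txt'), 'w') as f:
    f.write('nltk\ncorpus\ncorpora\nwordnet\n')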
Example #16
def train_test_split(min=25):
    '''
    Gets the train/test set for testing from the Reuters corpus.  Keeps only
    the documents that have a category in both the test and train sets, with an
    optional user-defined minimum number of tokens.

    Parameters:
    ------------
    min: (default = 25) The minimum number of tokens in documents to test with.

    Returns:
    ------------
    train_set, train_target, test_set, test_target as lists
    '''
    #imports
    import re
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import CategorizedPlaintextCorpusReader

    #reading corpus
    reuters = LazyCorpusLoader('reuters',
                               CategorizedPlaintextCorpusReader,
                               '(training|test).*',
                               cat_file='cats.txt',
                               encoding='ISO-8859-2')

    documents = reuters.fileids()
    # splitting into train and test sets
    train_docs_id = list(filter(lambda doc: doc.startswith("train"),
                                documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
    #getting documents and their categories
    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    train_cat = [reuters.categories(doc_id) for doc_id in train_docs_id]
    test_cat = [reuters.categories(doc_id) for doc_id in test_docs_id]

    # formatting the train set: tokenizing and gathering stats for processing
    train_token_docs = []
    train_token_docs_length = []
    train_token_docs_unique = []
    for i in train_docs:
        tempy_tokens = tokenize(i)
        train_token_docs.append(" ".join(tempy_tokens))
        train_token_docs_length.append(len(tempy_tokens))
        train_token_docs_unique.append(len(set(tempy_tokens)))

    # formatting the test set: tokenizing and gathering stats for processing
    test_token_docs = []
    test_token_docs_length = []
    test_token_docs_unique = []
    for i in test_docs:
        tempy_tokens = tokenize(i)
        test_token_docs.append(" ".join(tempy_tokens))
        test_token_docs_length.append(len(tempy_tokens))
        test_token_docs_unique.append(len(set(tempy_tokens)))

    #removes any documents that do not meet the minimum tokens setting
    train_less_than_min = [
        n for n, i in enumerate(train_token_docs_length) if i < min
    ]
    test_less_than_min = [
        n for n, i in enumerate(test_token_docs_length) if i < min
    ]

    train_token_docs_more_than_min = [
        i for n, i in enumerate(train_token_docs)
        if n not in train_less_than_min
    ]

    test_token_docs_more_than_min = [
        i for n, i in enumerate(test_token_docs) if n not in test_less_than_min
    ]
    train_cat_more_than_min = [
        i for n, i in enumerate(train_cat) if n not in train_less_than_min
    ]
    test_cat_more_than_min = [
        i for n, i in enumerate(test_cat) if n not in test_less_than_min
    ]

    #getting documents with single categories
    #(corpus has some with multiple categories)
    cat_count_train = [len(i) for i in train_cat_more_than_min]
    cat_count_test = [len(i) for i in test_cat_more_than_min]

    single_cat_train = [n for n, i in enumerate(cat_count_train) if i == 1]
    single_cat_test = [n for n, i in enumerate(cat_count_test) if i == 1]

    train_single = [
        i for n, i in enumerate(train_token_docs_more_than_min)
        if n in single_cat_train
    ]
    test_single = [
        i for n, i in enumerate(test_token_docs_more_than_min)
        if n in single_cat_test
    ]
    train_single_cat = [
        i for n, i in enumerate(train_cat_more_than_min)
        if n in single_cat_train
    ]
    test_single_cat = [
        i for n, i in enumerate(test_cat_more_than_min) if n in single_cat_test
    ]

    train_cat_set = set([i[0] for i in train_single_cat])
    test_cat_set = set([i[0] for i in test_single_cat])

    mutual_cat = train_cat_set.intersection(test_cat_set)

    member_of_mutual_test = [
        n for n, i in enumerate(test_single_cat) if i[0] in mutual_cat
    ]
    member_of_mutual_train = [
        n for n, i in enumerate(train_single_cat) if i[0] in mutual_cat
    ]

    train_single2 = [
        i for n, i in enumerate(train_single) if n in member_of_mutual_train
    ]
    test_single2 = [
        i for n, i in enumerate(test_single) if n in member_of_mutual_test
    ]
    train_single_cat2 = [
        i for n, i in enumerate(train_single_cat)
        if n in member_of_mutual_train
    ]
    test_single_cat2 = [
        i for n, i in enumerate(test_single_cat) if n in member_of_mutual_test
    ]

    return train_single2, train_single_cat2, test_single2, test_single_cat2
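train_test_split() calls a module-level tokenize() helper that is not included in this snippet (note the unused re and RegexpTokenizer imports inside the function). One possible stand-in, offered only as an assumption, followed by a call:

from nltk.tokenize import RegexpTokenizer

_tokenizer = RegexpTokenizer(r'[A-Za-z]+')

def tokenize(text):
    # lowercase alphabetic tokens only; numbers and punctuation are dropped
    return _tokenizer.tokenize(text.lower())

train_set, train_target, test_set, test_target = train_test_split(min=25)
print(len(train_set), len(test_set))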
Example #17
from nltk.data import path
from nltk.tree import *
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedBracketParseCorpusReader

path.extend(['./testing/'])


# quick check: how a dash-only token splits on '-'
s = '---'
print(s.split('-'))


icepahc = LazyCorpusLoader(
    'icepahc-v0.9/psd/', CategorizedBracketParseCorpusReader,
    r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
)

fileids = icepahc.fileids() # TODO: Get fileid info per tree for tree ids
# sents = icepahc.parsed_sents()

def sent_text(sentence):
    '''
    Takes an nltk Tree object and returns the sentence text as a string.
    '''
    text = []
    leaves = sentence.pos()
    for leaf in leaves:
        # print(leaf)
        if len(leaf[0]) == 1:
            leaf = leaf[1]
        elif leaf[0] == '---':
            leaf = '-'
        elif leaf[0] == '-----':
Example #18
import os
import re
import time

from nltk.data import path
from nltk.corpus.util import LazyCorpusLoader

# IcePaHCFormatReader, Converter, tagged_corpus and the fix_*/run_* helpers
# come from the project's own modules and are assumed to be imported elsewhere.


def main():

    IcePaHC_DIR = '../psd/corpora/icepahc-v0.9/psd'
    FIXED_IcePaHC_DIR = '../psd/corpora/icepahc-v0.9/psd_fix'

    fix_annotation_errors(IcePaHC_DIR, FIXED_IcePaHC_DIR)

    run_pre(FIXED_IcePaHC_DIR)

    path.extend(['..'])

    ICEPAHC = LazyCorpusLoader(
        'icepahc-v0.9/psd_fix/', IcePaHCFormatReader,
        r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
        )

    fileids = ICEPAHC.fileids()  # leave uncommented for whole corpus use
    # fileids = ['1150.homiliubok.rel-ser.psd'] # For debug use only
    # fileids = ['2008.mamma.nar-fic.psd', '2008.ofsi.nar-sag.psd'] # For debug use only

    # Instance of Converter class
    c = Converter(auto_tags='corpus')
    # c = Converter()
    total_sents = 0
    file_num = 1

    # OUTPUT_DIR = '../testing/CoNLLU_output/'
    OUTPUT_DIR = '../IcePaHC-CoNLLU/'
    if not os.path.isdir(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    # f = open('ofsi_conllu.conllu', 'w+')

    ''' Prints the dependency graph data in conllU format '''
    for fileid in fileids:

        OUT_FILE_NAME = re.sub(r'\.psd', '.conllu', fileid)
        OUT_FILE_PATH = OUTPUT_DIR + OUT_FILE_NAME
        OUT_FILE = open(OUT_FILE_PATH, 'w+')

        # file_length = len(ICEPAHC.parsed_sents(fileid))

        error_num = 0
        start = time.time()
        file_sents = 0

        # print('\nProcessing file: {0}...'.format(fileid))
        tree_counter = 0

        tag_dict = tagged_corpus(ICEPAHC.parsed_sents(fileid))
        c.set_tag_dict(tag_dict)

        to_join = []
        try:
            for tree in ICEPAHC.parsed_sents(fileid):

                # Catch error in corpus where punctuation tokens are missing
                tree = fix_IcePaHC_tree_errors(tree)

                # UniversalDependencyGraph object created from tree
                dep = c.create_dependency_graph(tree)

                # Sentences split between clauses joined together and output written
                # to file
                if dep.get_by_address(len(dep.nodes)-1)['word'] not in {'.', ':', '?', '!', 'kafli', 'kapítuli'} \
                and len(dep.nodes) != 1:
                    to_join.append(dep)
                else:
                    if len(to_join) == 0:
                        # write out dep. graphs that don't need to be joined
                        dep_c = c.add_space_after(dep).to_conllU()

                        sent_id = re.sub(r'\.psd', '', fileid).upper() + ',' + str(file_sents+1) + '.' + str(total_sents+1)
                        sent_id_line = '# sent_id = ' + sent_id + '\n'

                        text_line = dep.plain_text()+'\n'
                        # icepahc_id_line = str(dep.original_ID_plain_text(corpus_name='IcePaHC')) + '\n'
                        icepahc_id_line = str(dep.original_ID_plain_text()) + '\n'
                        OUT_FILE.write(sent_id_line)
                        OUT_FILE.write(icepahc_id_line)
                        OUT_FILE.write(text_line)
                        OUT_FILE.write(dep_c)
                        file_sents += 1
                        total_sents += 1
                    else:
                        # write out joined dependency graphs
                        to_join.append(dep)
                        dep = c.add_space_after(c.join_graphs(to_join))
                        dep_c = dep.to_conllU()

                        sent_id = re.sub(r'\.psd', '', fileid).upper() + ',' + str(file_sents+1) + '.' + str(total_sents+1)
                        sent_id_line = '# sent_id = ' + sent_id + '\n'

                        text_line = dep.plain_text()+'\n'
                        # icepahc_id_line = str(dep.original_ID_plain_text(corpus_name='IcePaHC')) + '\n'
                        icepahc_id_line = str(dep.original_ID_plain_text()) + '\n'
                        OUT_FILE.write(sent_id_line)
                        OUT_FILE.write(icepahc_id_line)
                        OUT_FILE.write(text_line)
                        OUT_FILE.write(dep_c)
                        file_sents += 1
                        total_sents += 1
                    to_join = []

                tree_counter += 1
        except Exception as ex:
            print('ERROR', '# sent_id =', sent_id)
            print(tree.corpus_id)
            print(tree)
            print('Failure - {0}. Arguments:\n{1!r}'.format(type(ex).__name__, ex.args))
            error_num += 1
            raise

        run_post_file(OUT_FILE_PATH)

        end = time.time()
        duration = '%.2f' % float(end - start)
        # if error_num > 0:
        print('\t'.join([str(i) for i in [file_num, fileid, tree_counter, file_sents, error_num, str(duration)+' sec']]))
        file_num += 1
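The original module presumably invokes main() from a standard entry-point guard; shown here only as an assumption:

if __name__ == '__main__':
    main()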
Example #19
from collections import defaultdict
import time
import re
import string

from nltk.data import path
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedBracketParseCorpusReader

# DMII_data and Converter come from the project's own modules and are assumed
# to be imported elsewhere.

path.extend(['./testing/'])

DMII_combined = DMII_data.load_json('combined') # TODO: Move to features script

icepahc = LazyCorpusLoader(
    'icepahc-v0.9/psd/', CategorizedBracketParseCorpusReader,
    r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
)

if __name__ == '__main__':
    fileids = icepahc.fileids() # leave uncommented for whole corpus use
    #fileids = ['1350.bandamennM.nar-sag.psd'] # For debug use only
    c = Converter() # Creates instance of Converter class
    total_sents = 0
    file_num = 1

    # f = open('homilia_conllu.conllu', 'w+')

    ''' Prints the dependency graph data in conllU format '''
    for fileid in fileids:
        error_num = 0
        start = time.time()
        file_sents = 0
        #print('\nProcessing file: {0}...'.format(fileid))
        for tree in icepahc.parsed_sents(fileid):
            treeID = fileid + '_' + str(file_sents+1) + '_' + str(total_sents+1)
Example #20
#         return source   # tail of an earlier function, truncated in the original snippet


import pyodbc

from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import XMLCorpusReader

# correctPersianString (used below) is a normalisation helper from the
# original project and is assumed to be defined elsewhere.

connection = pyodbc.connect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=E:\\farsnettest.mdb")
c = connection.cursor()
# c.execute("select number,example from shir")
corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader, r'(?!\.).*\.xml')
word = u'شیر'  # the Persian word 'shir', ambiguous between lion / milk / tap
targ = 0
c.execute("select * from shir")
for row in c:
    print row

# out = codecs.open('d:\\shirham.txt', 'w', 'utf-8')
for file in corpus.fileids():
    # if num == 1000: break
    for doc in corpus.xml(file).getchildren():
        cat = doc.getchildren()[3].text
        text = doc.getchildren()[5].text
        newtext = correctPersianString(text)
        allwords = text.split()
        sents = newtext.split('.')

        for sent in sents:
            if word in sent.split():
                targ += 1
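The snippet is cut off inside the loop; assuming the goal is simply to report how many sentences contain the target word, a natural follow-up would be:

print 'sentences containing the target word:', targ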