def parse_wsj(processes=8):
    ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ portions
        'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt', tagset='wsj')

    fileids = ptb.fileids()
    params = []
    for f in fileids:
        corpus = zip(ptb.parsed_sents(f), ptb.tagged_sents(f))
        for i, (parsed, tagged) in enumerate(corpus):
            params.append((f, i, parsed, tagged))

    p = Pool(processes)
    p.starmap(get_best_parse, sorted(params, key=lambda x: (x[0], x[1])))

def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )

class DealCorpus:
    count = 0

    def __init__(self):
        'Initialise everything needed for processing the crawled data'
        DealCorpus.count += 1
        # initialise the GithubRepo project in use
        self.initlog()
        self.loaddiff()

    def loaddiff(self):
        corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
        # Chinese directory names come out garbled, so decode explicitly
        corpus_root = unicode(corpus_root, "GB2312")
        self.logger.info(corpus_root)
        pattern_1 = r".*/diff1/.*\.txt"
        pattern_2 = r".*/diff2/.*\.txt"
        pattern_3 = r".*/diff3/.*\.txt"
        self.logger.info("lazy-loading the corpora")
        self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
        self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_2)
        self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_3)
        self.logger.info("corpora loaded")

    def log(self, type):
        import json
        if not os.path.exists(self.logfile):
            load_dict = {'java_sanit': False, 'java_merge': False, 'java_dict': False, 'java_model': False,
                         'xml_sanit': False, 'xml_merge': False, 'xml_dict': False, 'xml_model': False, }
            logging.info(load_dict)
            with open(self.logfile, "w") as dump_f:
                json.dump(load_dict, dump_f)
                dump_f.close()
        with open(self.logfile, 'r') as load_f:
            load_dict = json.load(load_f)
            print load_dict
            load_f.close()
        if type == DealCorpusTask.JAVA_DICT:
            load_dict['java_dict'] = True
        if type == DealCorpusTask.JAVA_MODEL:
            load_dict['java_model'] = True
        if type == DealCorpusTask.JAVA_MERGE:
            load_dict['java_merge'] = True
        if type == DealCorpusTask.JAVA_SANIT:
            load_dict['java_sanit'] = True
        if type == DealCorpusTask.XML_DICT:
            load_dict['xml_dict'] = True
        if type == DealCorpusTask.XML_MODEL:
            load_dict['xml_model'] = True
        if type == DealCorpusTask.XML_MERGE:
            load_dict['xml_merge'] = True
        if type == DealCorpusTask.XML_SANIT:
            load_dict['xml_sanit'] = True
        logging.info(load_dict)
        self.java_dict = load_dict['java_dict']
        self.java_model = load_dict['java_model']
        self.java_predeal = load_dict['java_merge']
        self.java_sanit = load_dict['java_sanit']
        self.xml_dict = load_dict['xml_dict']
        self.xml_model = load_dict['xml_model']
        self.xml_predeal = load_dict['xml_merge']
        self.xml_sanit = load_dict['xml_sanit']
        # print "++++++++++++++++++++++++++++", load_dict
        with open(self.logfile, "w") as dump_f:
            json.dump(load_dict, dump_f)
            dump_f.close()

    def initlog(self):
        self.logfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2", "DealCorpus.txt")
        print self.logfile
        self.java_dict = False
        self.java_model = False
        self.java_pic = False
        self.java_predeal = False
        self.java_sanit = False
        self.xml_dict = False
        self.xml_model = False
        self.xml_pic = False
        self.xml_predeal = False
        self.xml_sanit = False
        self.log(None)
        self.outputfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2", "DealCorpusOutput.txt")
        if not os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(), "data4")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data4"))
        self.merge_corpus_java_file = os.path.join(GetFilePathRoot.get_root_dir(), "data4", "merge_corpus_java.txt")
        self.dict_corpus_java_file = os.path.join(GetFilePathRoot.get_root_dir(), "data4", "java_dict_corpus.txt")
        self.model_corpus_java_file = os.path.join(GetFilePathRoot.get_root_dir(), "data4", "model_corpus_java")
        import logging
        # create a logger
        self.logger = logging.getLogger("DealCorpus")
        self.logger.setLevel(logging.DEBUG)
        # create a handler that writes to the log file
        fh = logging.FileHandler(self.outputfile)
        fh.setLevel(logging.DEBUG)
        # create a second handler that writes to the console
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        # define the output format for both handlers
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        # attach the handlers to the logger
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)

    def getAll(self):
        # return
        self.logger.info("deal java")
        if not self.java_sanit:
            self.logger.info("extract")
            self.getsents()
        if not self.java_predeal:
            self.logger.info("merge")
            self.mergecorpus()
        if not self.java_dict:
            self.logger.info("java_dict")
            self.getdict()
        if not self.java_model:
            self.logger.info("java_model")
            self.getmodel()
        self.logger.info("deal xml")
        if not self.xml_sanit:
            self.logger.info("xml_sanit")
        if not self.xml_predeal:
            self.logger.info("predeal")
        if not self.xml_dict:
            self.logger.info("xml_dict")
        if not self.xml_model:
            self.logger.info("xml_model")

    def mergecorpus(self, type="java"):
        if type == "java":
            rootDir = os.path.join(GetFilePathRoot.get_root_dir(), "data3")
            targetfile = self.merge_corpus_java_file
        i = 0
        num = len(os.listdir(rootDir))
        k = open(targetfile, 'w')
        for lists in os.listdir(rootDir):
            path = os.path.join(rootDir, lists)
            i += 1
            self.logger.info((float(i) / num, path))
            with open(path, "r") as f:
                # print f.read()
                k.write(f.read() + "\n")
        k.close()
        self.log(DealCorpusTask.JAVA_MERGE)
        pass

    def getdict(self, type="java"):
        if type == "java":
            self.sourcefilename = self.merge_corpus_java_file
            self.targetfilename = self.dict_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        sentences = word2vec.Text8Corpus(self.sourcefilename)  # load the corpus
        dictionary = gensim.corpora.Dictionary(sentences)
        print len(dictionary)
        dictionary.save_as_text(self.targetfilename, sort_by_word=True)
        if type == "java":
            self.log(DealCorpusTask.JAVA_DICT)

    def getmodel(self, type="java"):
        if type == "java":
            self.sourcefilename = self.merge_corpus_java_file
            self.targetfilename = self.model_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        sentences = word2vec.Text8Corpus(self.sourcefilename)  # load the corpus
        model = word2vec.Word2Vec(sentences, size=200)  # train the skip-gram model; window=5 by default
        # save the model for later reuse
        model.save(self.targetfilename)
        if type == "java":
            self.log(DealCorpusTask.JAVA_MODEL)

    def getfile(self, type="java"):
        if type == "java":
            fileids = self.diff2.fileids()
            for file in fileids:
                print file
                yield file, self.diff2.raw(file)
        if type == "xml":
            fileids = self.diff3.fileids()
            for file in fileids:
                print file
                yield file, self.diff3.raw(file)

    def getsha(self, string):
        from hashlib import sha1
        ss = sha1(string)
        return ss.hexdigest()

    def getsents(self, type="java"):
        self.logger.info("getsents")
        if not os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(), "data3")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data3"))
        self.sha_name_file = os.path.join(GetFilePathRoot.get_root_dir(), "data2", "DealCorpus_sha_name_csv")
        i = 0
        self.logger.info(i)
        for name, file in self.getfile(type):
            i += 1
            self.logger.info((i, name))
            sents = file.replace("\r\n", "\n").replace("\r", "\n").split("\n")
            result = self.getpredeal(sents=sents)
            with open(self.sha_name_file, "a") as f:
                ff = csv.writer(f)
                ff.writerow([self.getsha(name), name])
                f.close()
            if os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(), "data3", self.getsha(name))):
                continue
            with open(os.path.join(GetFilePathRoot.get_root_dir(), "data3", self.getsha(name)), "w") as f:
                f.write(result)
                f.close()
        self.log(DealCorpusTask.JAVA_SANIT)

    def getpredeal(self, sents):
        # load the stop-token list
        newsents = []
        stop = ["+", "-", "*", "/", "%", "=", "!", ">", "<", "&", "|", "^", "~", "?",
                "(", ")", "[", "]", "{", "}", ",", ";"]
        comment = False
        for line in sents:
            import re
            if re.search("(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)", line, re.I):
                line = line.replace(
                    re.search("(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)", line, re.I).group(1), "")
                # print line
            if line != u' ':
                newline = None
                if "//" in line:
                    newline = line[0:line.index("//")]
                if "/*" in line:
                    comment = True
                    newline = line[0:line.index("/*")]
                if "*/" in line:
                    newline = line[line.index("*/") + 2:len(line)]
                    # print newline
                    self.logger.info(newline)
                    comment = False
                if "/*" in line and "*/" in line:
                    if line.index("/*") <= line.index("*/"):
                        newline = line[0:line.index("/*")] + line[line.index("*/") + 2:len(line)]
                        comment = False
                if comment or newline != None:
                    pass
                else:
                    newline = line
                if newline != None:
                    for s in stop:
                        if s in newline:
                            newline = " ".join(newline.split(s))
                    newline = newline.replace("\r\n", " ").replace("\n", " ").replace("\t", " ")
                    newline = " ".join(newline.split(" "))
                    newsents.append(newline)
                    newline = None
        result = ""
        for line in newsents:
            for word in line.split(" "):
                if word != "":
                    result += word + " "
        return result

    def gendiffpicture(self, word="activity", type="java", size=3):
        if type == "java":
            modelfile = self.model_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        model = word2vec.Word2Vec.load(modelfile)
        G = nx.Graph()
        blacknode = set([])

        def get(startwordlist):
            result = []
            for word in startwordlist:
                y2 = model.most_similar(word, topn=6)  # the 6 most related words
                for item in y2:
                    if not item[0] in blacknode:
                        result.append(item[0])
                        G.add_node(item[0])
                        G.add_weighted_edges_from([(word, item[0], item[1])])
                blacknode.add(word)
            return result

        start = [word]
        i = 0
        while True:
            i += 1
            if i == size:
                break
            result = get(start)
            print result
            start = result
        for node in G.nodes():
            print node
        pos = nx.spring_layout(G)
        nx.draw(G, pos=pos, node_color="r", with_labels=True, node_size=900, font_size=10)
        plt.show()

    def getsmailarword(self, word="activity", type="java", number=10):
        'Return words similar to the given word'
        if type == "java":
            modelfile = self.model_corpus_java_file
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        model = word2vec.Word2Vec.load(modelfile)
        return model.most_similar(word, topn=number)

class ExcateMode:
    count = 0

    def __init__(self):
        'Initialise everything needed for processing the crawled data'
        ExcateMode.count += 1
        # initialise the GithubRepo project in use
        self.initlog()
        self.loaddiff()

    def loaddiff(self):
        corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
        # Chinese directory names come out garbled, so decode explicitly
        corpus_root = unicode(corpus_root, "GB2312")
        self.logger.info(corpus_root)
        pattern_1 = r".*/diff1/.*\.txt"
        self.logger.info("lazy-loading the corpus")
        self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
        self.logger.info("corpus loaded")

    def log(self, type):
        import json
        if not os.path.exists(self.logfile):
            load_dict = {'extract': False, 'merge': False}
            logging.info(load_dict)
            with open(self.logfile, "w") as dump_f:
                json.dump(load_dict, dump_f)
                dump_f.close()
        with open(self.logfile, 'r') as load_f:
            load_dict = json.load(load_f)
            print load_dict
            load_f.close()
        if type == ExtractModeTask.MERGE:
            load_dict['merge'] = True
        if type == ExtractModeTask.EXTRACT:
            load_dict['extract'] = True
        logging.info(load_dict)
        self.merge = load_dict['merge']
        self.extract = load_dict['extract']
        # print "++++++++++++++++++++++++++++", load_dict
        with open(self.logfile, "w") as dump_f:
            json.dump(load_dict, dump_f)
            dump_f.close()

    def initlog(self):
        self.logfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2", "ExtractModel.txt")
        print self.logfile
        self.merge = False
        self.extract = False
        self.start_prefix = "##### start"
        self.change_prefix = "###### change :"
        self.name_prefix = "###### name :"
        self.end_prefix = "##### end"
        self.model_corpus_java_file = os.path.join(
            GetFilePathRoot.get_root_dir(), "data4", "model_corpus_java")
        modelfile = self.model_corpus_java_file
        self.model = word2vec.Word2Vec.load(modelfile)
        self.log(None)
        self.outputfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2", "ExtractModelOutput.txt")
        if not os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(), "data4")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data4"))
        self.merge_extract_java_file = os.path.join(
            GetFilePathRoot.get_root_dir(), "data4", "merge_extract_java.txt")
        import logging
        # create a logger
        self.logger = logging.getLogger("DealCorpus")
        self.logger.setLevel(logging.DEBUG)
        # create a handler that writes to the log file
        fh = logging.FileHandler(self.outputfile)
        fh.setLevel(logging.DEBUG)
        # create a second handler that writes to the console
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        # define the output format for both handlers
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        # attach the handlers to the logger
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)

    def getAll(self):
        # with open("test_1","r") as f:
        #     file = f.read()
        # sents = file.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        # # preprocess
        # dealsents = self.getpredeal(sents=sents)
        # all = self.getexcate(dealsents)
        # return
        self.logger.info("deal java")
        if not self.extract:
            self.logger.info("extract")
            self.getsents()

    def getfile(self, type="java"):
        fileids = self.diff1.fileids()
        for file in fileids:
            print file
            yield file, self.diff1.raw(file)

    def getsha(self, string):
        from hashlib import sha1
        ss = sha1(string)
        return ss.hexdigest()

    def getsents(self, type="java"):
        self.logger.info("getsents")
        import json
        # store the processed model extracted from the diff1 corpus
        if not os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(), "data5")):
            os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data5"))
        # record the mapping between original files and extraction results
        self.sha_name_file = os.path.join(GetFilePathRoot.get_root_dir(),
                                          "data2", "extract_sha_name_csv")
        i = 0
        self.logger.info(i)
        for name, file in self.getfile(type):
            i += 1
            self.logger.info((i, name))
            sents = file.replace("\r\n", "\n").replace("\r", "\n").split("\n")
            # preprocess
            dealsents = self.getpredeal(sents=sents)
            result = self.getexcate(dealsents)
            # csv file used for saving
            with open(self.sha_name_file, "a") as f:
                ff = csv.writer(f)
                ff.writerow([self.getsha(name), name])
                f.close()
            if os.path.exists(os.path.join(GetFilePathRoot.get_root_dir(), "data5",
                                           self.getsha(name) + ".json")):
                continue
            with open(os.path.join(GetFilePathRoot.get_root_dir(), "data5",
                                   self.getsha(name)) + ".json", "w") as dump_f:
                json.dump(result, dump_f)
                dump_f.close()
        self.log(ExtractModeTask.EXTRACT)
        return

    def getexcate(self, dealsents):
        all = []
        commit = []
        file = []
        change = {'del': [], "add": [], "content": [], "diff": []}
        commitstart = False
        for sent in dealsents:
            if self.start_prefix in sent:
                commitstart = True
            if commitstart == True:
                if self.change_prefix in sent:
                    pass
                elif self.name_prefix in sent:
                    if change != {'del': [], "add": [], "content": [], "diff": []}:
                        file.append(change)
                    commit.append(file)
                    # print file
                    file = []
                    change = {'del': [], "add": [], "content": [], "diff": []}
                elif self.start_prefix in sent:
                    pass
                    # print sent
                elif self.end_prefix in sent:
                    commitstart = False
                    if change != {'del': [], "add": [], "content": [], "diff": []}:
                        file.append(change)
                    commit.append(file)
                    all.append(commit)
                    commit = []
                    file = []
                    change = {'del': [], "add": [], "content": [], "diff": []}
                else:
                    import re
                    if re.search("(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)", sent, re.I):
                        sent = sent.replace(
                            re.search("(@@\s*\-[0-9]+,[0-9]+\s*\+[0-9]+,[0-9]+\s*@@)",
                                      sent, re.I).group(1), "")
                        if change != {'del': [], "add": [], "content": [], "diff": []}:
                            file.append(change)
                            change = {'del': [], "add": [], "content": [], "diff": []}
                        if sent.replace(" ", "") != "":
                            if sent.startswith("-") and sent.replace("-", "").replace(" ", "") != "":
                                change['del'].append(sent)
                            elif sent.startswith("+") and sent.replace("+", "").replace(" ", "") != "":
                                change['add'].append(sent)
                            else:
                                if sent.replace("-", "").replace(" ", "") != "" and \
                                        sent.replace("+", "").replace(" ", "") != "":
                                    change["content"].append(sent)
                    else:
                        if sent.replace(" ", "") != "":
                            if sent.startswith("-") and sent.replace("-", "").replace(" ", ""):
                                change['del'].append(sent)
                            elif sent.startswith("+") and sent.replace("+", "").replace(" ", ""):
                                change['add'].append(sent)
                            else:
                                if sent.replace("-", "").replace(" ", "") != "" and \
                                        sent.replace("+", "").replace(" ", "") != "":
                                    change["content"].append(sent)

        for commit in all:
            # print "aaaaaaa", commit
            for file in commit:
                # print "ffffffff", file
                for change in file:
                    # print "cccccccccccc", change
                    # print "     ", change['content']
                    change['diff'] = self.getsmailer(change['add'], change['del'])
                    for diff in change['diff']:
                        if diff[1] in change['add']:
                            # print "     remove add"
                            change['add'].remove(diff[1])
                        if diff[2] in change['del']:
                            # print "     remove del"
                            change['del'].remove(diff[2])
                    # print "     ", change['del']
                    # print "     ", change['add']
                    # for diff in change['diff']:
                    #     print "     dddddddd", diff
        return all

    def emd(self, dist, w1, w2):
        import numpy as np
        import rpy2.robjects as robjects
        # import lp.transport() from R
        robjects.r['library']('lpSolve')
        transport = robjects.r['lp.transport']
        """R's transport() function is used to compute the EMD"""
        # arguments for transport()
        costs = robjects.r['matrix'](robjects.FloatVector(dist), nrow=len(w1), ncol=len(w2), byrow=True)
        row_signs = ["<"] * len(w1)
        row_rhs = robjects.FloatVector(w1)
        col_signs = [">"] * len(w2)
        col_rhs = robjects.FloatVector(w2)
        t = transport(costs, "min", row_signs, row_rhs, col_signs, col_rhs)
        flow = t.rx2('solution')
        dist = dist.reshape(len(w1), len(w2))
        flow = np.array(flow)
        work = np.sum(flow * dist)
        # print "***", (np.sum(flow)), work
        emd = (work + np.float64(2)) / (np.sum(flow) + np.float64(0.1))
        return emd

    def getsentencesmaliar(self, scenceA, scenceB):
        # Uses word vectors to measure sentence similarity. Two known issues:
        # 1. it is slow; 2. only a Java model has been trained so far.
        am = self.model
        import numpy as np
        f1 = scenceA.split()
        f2 = scenceB.split()
        n1 = len(scenceA.split())
        n2 = len(scenceB.split())
        # build a distance matrix
        dist = np.zeros(n1 * n2)
        for i in range(n1):
            for j in range(n2):
                try:
                    t1 = am.wv[f1[i]]
                except KeyError:
                    continue
                    pass
                try:
                    t2 = am.wv[f2[j]]
                except:
                    continue
                    pass
                dist[i * n2 + j] = self.euclid_dist(t1, t2) + 0.01
        first_signature = np.ones(n1)
        second_signature = np.ones(n2)
        # print "*****", dist
        return 1.0 / (1 + self.emd(dist, first_signature, second_signature))

    def euclid_dist(self, feature1, feature2):
        """Compute the Euclidean distance"""
        if len(feature1) != len(feature2):
            print "ERROR: calc euclid_dist: %d <=> %d" % (len(feature1), len(feature2))
            return -1
        return np.sqrt(np.sum((feature1 - feature2)**2))

    def getsmailer(self, add=[], dele=[]):
        # fairly expensive in terms of time
        result = []
        if len(add) <= len(dele):
            for sa in add:
                minvalue = 0
                temp = ("", "")
                for sd in dele:
                    import difflib
                    seq = difflib.SequenceMatcher(None, sa, sd)
                    ratio = seq.ratio()
                    # print ratio, (nsa, nsd)
                    smailarvalue = ratio
                    if smailarvalue > minvalue:
                        minvalue = smailarvalue
                        temp = (sa, sd)
                if minvalue != 0:
                    result.append((minvalue, temp[0], temp[1]))
                    # print minvalue, self.getsentencesmaliar(temp[2], temp[3]), (temp[0], temp[1])
        elif len(add) > len(dele):
            for sd in dele:
                minvalue = 0
                temp = ("", "")
                for sa in add:
                    import difflib
                    seq = difflib.SequenceMatcher(None, sa, sd)
                    ratio = seq.ratio()
                    # print ratio, (nsa, nsd)
                    smailarvalue = ratio
                    if smailarvalue > minvalue:
                        minvalue = smailarvalue
                        temp = (sa, sd)
                if minvalue != 0:
                    result.append((minvalue, temp[0], temp[1]))
                    # print minvalue, self.getsentencesmaliar(temp[2], temp[3]), (temp[0], temp[1])
        return result

    def getpredeal(self, sents):
        # load the stop-token list
        newsents = []
        comment = False
        for line in sents:
            if line != u' ':
                newline = None
                if "//" in line:
                    newline = line[0:line.index("//")]
                if "/*" in line:
                    comment = True
                    newline = line[0:line.index("/*")]
                if "*/" in line:
                    newline = line[line.index("*/") + 2:len(line)]
                    # print newline
                    self.logger.info(newline)
                    comment = False
                if "/*" in line and "*/" in line:
                    if line.index("/*") <= line.index("*/"):
                        newline = line[0:line.index("/*")] + line[line.index("*/") + 2:len(line)]
                        comment = False
                if comment or newline != None:
                    pass
                else:
                    newline = line
                if newline != None:
                    newline = " ".join(newline.split(" "))
                    newsents.append(newline)
                    newline = None
        result = []
        for line in newsents:
            newline = ""
            for word in line.split(" "):
                if word != "":
                    newline += word + " "
            result.append(newline)
        return result

def summarize_cisco_support_forum_texts():
    # cisco_plain_text = LazyCorpusLoader(
    #     'content', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin_1')
    cisco_plain_text = LazyCorpusLoader(
        "cisco_forum_subset", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin_1"
    )
    token_dict = {}
    for article in cisco_plain_text.fileids():
        token_dict[article] = cisco_plain_text.raw(article)

    tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words="english", decode_error="ignore")
    sys.stdout.flush()

    # creates Compressed Sparse Row format numpy matrix
    tdm = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()

    # problem_statement_#1 - summarize support_forum articles automatically
    for article_id in range(0, tdm.shape[0] - 2):
        article_text = cisco_plain_text.raw(cisco_plain_text.fileids()[article_id])
        sent_scores = []
        for sentence in nltk.sent_tokenize(article_text):
            score = 0
            sent_tokens = tokenize_and_stem(sentence)
            for token in (t for t in sent_tokens if t in feature_names):
                score += tdm[article_id, feature_names.index(token)]
            sent_scores.append((score / len(sent_tokens), sentence))
        summary_length = int(math.ceil(len(sent_scores) / 5))
        sent_scores.sort(key=lambda sent: sent[0])
        print "\n*** SUMMARY ***"
        for summary_sentence in sent_scores[:summary_length]:
            print summary_sentence[1]
        print "\n*** ORIGINAL ***"
        print article_text

    # problem_statement_#2 - automatically categorize forum posts by tags into various groups
    reduce_dimensionality_and_cluster_docs(tfidf, tdm, num_features=200)

    # problem_statement_#3 - find similar documents to a current document (that user is reading) automatically
    # eg - quora: find similar questions, find similar answers
    cosine_similarity(tdm[0:1], tdm)
    """
    output looks like this
    array([[ 1.        ,  0.22185251,  0.0215558 ,  0.03805012,  0.04796646,
             0.05069365,  0.05507056,  0.03374501,  0.03643342,  0.05308392,
             0.06002623,  0.0298806 ,  0.04177088,  0.0844478 ,  0.07951179,
             0.02822186,  0.03036787,  0.11022385,  0.0535391 ,  0.10009412,
             0.07432719,  0.03753424,  0.06596462,  0.01256566,  0.02135591,
             0.13931643,  0.03062681,  0.02595649,  0.04897851,  0.06276997,
             0.03173952,  0.01822134,  0.04043555,  0.06629454,  0.05436211,
             0.0549144 ,  0.04400169,  0.05157118,  0.05409632,  0.09541703,
             0.02473209,  0.05646599,  0.05728387,  0.04672681,  0.04519217,
             0.04126276,  0.06289187,  0.03116767,  0.04828476,  0.04745193,
             0.01404426,  0.04201325,  0.023492  ,  0.07138136,  0.03778315,
             0.03677206,  0.02553581]])
    The first document is compared to the rest, with the most similar to it
    being itself with score of 1, next most similar to it is document with
    score 0.22185251
    """

    cosine_similarities = linear_kernel(tdm[0:1], tdm).flatten()
    # mapping back to document_name space
    related_docs_indices = cosine_similarities.argsort()
    """
    document_ids
    array([23, 50, 31, 24,  2, 52, 40, 56, 27, 15, 11, 16, 26, 47, 30,  7,  8,
           55, 21, 54,  3, 32, 45, 12, 51, 36, 44, 43, 49,  4, 48, 28,  5, 37,
            9, 18, 38, 34, 35,  6, 41, 42, 10, 29, 46, 22, 33, 53, 20, 14, 13,
           39, 19, 17, 25,  1,  0])
    docs 0 and 1 are very similar which are the following posts (last 2 array elements above when sorted)
    https://supportforums.cisco.com/discussion/11469881/aniserver-failed-run-lms-40
    and
    supportforums.cisco.com/discussion/11469606/eos-lms-31-support-quest
    """
    cosine_similarities[related_docs_indices]
    for key, value in token_dict.iteritems():
        print key, value

    # find the actual posts which are the most similar
    tfidf.inverse_transform(tdm)[0]
    tfidf.inverse_transform(tdm)[1]

wordlist = LazyCorpusLoader(
    'bamana/wordlist', PlaintextCorpusReader, r'bailleul.clean.wordlist',
    word_tokenizer=orthographic_word, encoding='utf-8')

properlist = LazyCorpusLoader(
    'bamana/propernames', PlaintextCorpusReader, r'.*\.clean\.wordlist',
    word_tokenizer=orthographic_word, encoding='utf-8')

propernames = LazyCorpusLoader(
    'bamana/propernames', ToolboxCorpusReader, '.*\.txt', encoding='utf-8')

bailleul = LazyCorpusLoader(
    'bamana/bailleul', ToolboxCorpusReader, r'bailleul.txt', encoding='utf-8')

lexicon = ElementTree(bailleul.xml('bailleul.txt'))

for file in propernames.fileids():
    for e in ElementTree(propernames.xml(file)).findall('record'):
        ge = Element('ge')
        ge.text = e.find('lx').text
        e.append(ge)
        ps = Element('ps')
        ps.text = 'n.prop'
        e.append(ps)
        lexicon.getroot().append(e)

wl = {}
wl_detone = {}

def normalize_bailleul(word):
    return u''.join([c for c in word if c not in u'.-'])

interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro', CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=','
)

print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]

print "generating list of documents for each category"
documents = [
    (list(interrogazioni.words(fileid)), category)
    for category in interrogazioni.categories()
    for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)

print "building the classifier"
featuresets = [(document_features(d, top_words), c) for (d, c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print "classifier accuracy: ", nltk.classify.accuracy(classifier, test_set)

def loadClassifier(outputdir):
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle")
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
        # classifier = pickleLoad("originalnaivebayes.pickle")
        # MNB_classifier = pickleLoad("MNB_classifier.pickle")
        # BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
        # LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
        # SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
        # LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
        #
        # voted_classifier = VoteClassifier(classifier,
        # #                                 NuSVC_classifier,
        #                                   LinearSVC_classifier,
        #                                   SGDClassifier_classifier,
        #                                   MNB_classifier,
        #                                   BernoulliNB_classifier,
        #                                   LogisticRegression_classifier)
        voted_classifier = pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
            'criticas_cine', CategorizedPlaintextCorpusReader,
            r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
            encoding='utf-8')
        # criticas_cine = LazyCorpusLoader(
        #     'criticas_cine_neu', CategorizedPlaintextCorpusReader,
        #     r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
        #     encoding='utf-8')

        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
        #
        # document_pos = [(list(criticas_cine.words(fileid)), "pos")
        #                 for fileid in criticas_cine.fileids("pos")]
        # document_neg = [(list(criticas_cine.words(fileid)), "neg")
        #                 for fileid in criticas_cine.fileids("neg")]
        # document_neu = [(list(criticas_cine.words(fileid)), "neu")
        #                 for fileid in criticas_cine.fileids("neu")]

        random.shuffle(documents)
        # random.shuffle(document_pos)
        # random.shuffle(document_neg)
        # random.shuffle(document_neu)

        all_words = []
        for w in criticas_cine.words():
            all_words.append(w.lower())
        # for w in criticas_cine.words():
        #     if not is_filtered(w.lower()):
        #         all_words.append(w.lower())
        #
        all_words = nltk.FreqDist(all_words)
        # print(all_words.most_common(50))

        # Filtering by type of word
        # for sample in all_words:

        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")

        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
        # featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
        # featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
        # featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]

        # training_set = featuresetpos[:1000]
        # training_set.extend(featuresetneg[:1000])
        # training_set.extend(featuresetneu[:1000])
        # testing_set = featuresetpos[1000:1273]
        # testing_set.extend(featuresetneg[1000:])
        # testing_set.extend(featuresetneu[1000:])

        # pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
        # neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
        # neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]

        training_set = featuresets[:2000]
        testing_set = featuresets[2000:]

        classifier = nltk.NaiveBayesClassifier.train(training_set)
        # pickleDump(classifier, "originalnaivebayes.pickle")
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy)*100)

        accuracy = Accuracy(classifier, testing_set)
        print(accuracy)
        # order: neu, neg, pos
        # print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
        # print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
        # print("Failed: ", (accuracy["neu"][1]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][1])/3)
        # print("Pos:", nltk.classify.accuracy(classifier, pos_feat)*100)
        # print("Neu:", nltk.classify.accuracy(classifier, neu_feat)*100)
        # print("Neg:", nltk.classify.accuracy(classifier, neg_feat)*100)
        classifier.show_most_informative_features(15)

        MNB_classifier = SklearnClassifier(MultinomialNB())
        MNB_classifier.train(training_set)
        MNB_classifierAccuracy = nltk.classify.accuracy(MNB_classifier, testing_set)
        print("MNB_classifier accuracy percent:", (MNB_classifierAccuracy)*100)
        # pickleDump(MNB_classifier, "MNB_classifier.pickle")

        BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
        BernoulliNB_classifier.train(training_set)
        BernoulliNB_classifierAccuracy = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
        print("BernoulliNB_classifier accuracy percent:", (BernoulliNB_classifierAccuracy)*100)
        # pickleDump(BernoulliNB_classifier, "BernoulliNB_classifier.pickle")

        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(training_set)
        LogisticRegression_classifierAccuracy = nltk.classify.accuracy(LogisticRegression_classifier, testing_set)
        print("LogisticRegression_classifier accuracy percent:", (LogisticRegression_classifierAccuracy)*100)
        # pickleDump(LogisticRegression_classifier, "LogisticRegression_classifier.pickle")

        SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
        SGDClassifier_classifier.train(training_set)
        SGDClassifier_classifierAccuracy = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
        print("SGDClassifier_classifier accuracy percent:", (SGDClassifier_classifierAccuracy)*100)
        # pickleDump(SGDClassifier_classifier, "SGDClassifier_classifier.pickle")

        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(training_set)
        LinearSVC_classifierAccuracy = nltk.classify.accuracy(LinearSVC_classifier, testing_set)
        print("LinearSVC_classifier accuracy percent:", (LinearSVC_classifierAccuracy)*100)
        # pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")

        # SVC_classifier = SklearnClassifier(SVC())
        # SVC_classifier.train(training_set)
        # print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

        NuSVC_classifier = SklearnClassifier(NuSVC())
        NuSVC_classifier.train(training_set)
        NuSVC_classifierAccuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set)
        print("NuSVC_classifier accuracy percent:", (NuSVC_classifierAccuracy)*100)
        # pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")

        # pickleDump([NaiveBayesClassifierAccuracy,
        #             LinearSVC_classifierAccuracy,
        #             SGDClassifier_classifierAccuracy,
        #             MNB_classifierAccuracy,
        #             BernoulliNB_classifierAccuracy,
        #             LogisticRegression_classifierAccuracy], "accuracies.pickle")

        voted_classifier = VoteClassifier(
            [classifier, NaiveBayesClassifierAccuracy],
            [NuSVC_classifier, NuSVC_classifierAccuracy],
            [LinearSVC_classifier, LinearSVC_classifierAccuracy],
            [SGDClassifier_classifier, SGDClassifier_classifierAccuracy],
            [MNB_classifier, MNB_classifierAccuracy],
            [BernoulliNB_classifier, BernoulliNB_classifierAccuracy],
            [LogisticRegression_classifier, LogisticRegression_classifierAccuracy])

        accuracy = Accuracy(voted_classifier, testing_set)
        print(accuracy)

        VoteClassifierAccuracy = nltk.classify.accuracy(voted_classifier, testing_set)
        print("VoteClassifier accuracy percent:", (VoteClassifierAccuracy)*100)
        # print("Pos:", nltk.classify.accuracy(voted_classifier, pos_feat)*100)
        # print("Neu:", nltk.classify.accuracy(voted_classifier, neu_feat)*100)
        # print("Neg:", nltk.classify.accuracy(voted_classifier, neg_feat)*100)

        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/2)
        print("Discarded: ", (accuracy["neu"][1]+accuracy["neg"][1]+accuracy["pos"][1])/2)
        print("Failed: ", (accuracy["neu"][0]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][0])/2)
        print("------------------------------------------")

        pickleDump(voted_classifier, "voted_classifier.pickle")

        return voted_classifier, word_features

train_test_ratio = 2.0 / 3

def pickleObject():
    obj = classifier
    savefile = open('classifier.pickle', 'w')
    cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)

def pickleFeats():
    obj = words_in_sentence
    savefile = open('feats.pickle', 'w')
    cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)

files_in_neg = movie_reviews.fileids('neg')
files_in_pos = movie_reviews.fileids('pos')

neg_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'neg') for f in files_in_neg]
pos_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'pos') for f in files_in_pos]

negative_first_test_pos = int(len(neg_data) * train_test_ratio)
positive_first_test_pos = int(len(pos_data) * train_test_ratio)

train_data = neg_data[:negative_first_test_pos] + pos_data[:positive_first_test_pos]
test_data = neg_data[negative_first_test_pos:] + pos_data[positive_first_test_pos:]

print 'training on %d paragraphs and testing on %d paragraphs' % (len(train_data), len(test_data))

classifier = NaiveBayesClassifier.train(train_data)

print 'accuracy:', nltk.classify.util.accuracy(classifier, test_data)

classifier.show_most_informative_features(20)

import random

from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.text import TextCollection
from collections import Counter
from nltk.tokenize import word_tokenize
from operator import itemgetter

dialect = LazyCorpusLoader(
    'dialects1', CategorizedPlaintextCorpusReader, r'(?!\.).*\.txt',
    cat_pattern=r'(egyptian|gulf|levantine|standardArabic)/.*', encoding="utf-8")

x = TextCollection(dialect)

sentences = [(list(dialect.words(fileid)[i:i + 40]), category)
             for category in dialect.categories()
             for fileid in dialect.fileids(category)
             for i in range(0, len(dialect.words(fileid)), 40)]

shuffled_sentences = random.sample(sentences, len(sentences))
print('sentences count', len(sentences))

text = dialect.words()
print('words count', len(text))

#################### Test with getting topN ##############################
# all_words = nltk.FreqDist(w for w in dialect.words())
# Mcommon = all_words.most_common(4000)
# topN = [i[0] for i in Mcommon]
# print('finished topN')
###########################################################################

from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import WordListCorpusReader

# The reader stays a LazyCorpusLoader until a corpus method is first called;
# after that it behaves like the wrapped WordListCorpusReader.
reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist.txt'])
print(isinstance(reader, LazyCorpusLoader))
print(reader.fileids())
print(isinstance(reader, LazyCorpusLoader))
print(isinstance(reader, WordListCorpusReader))

def train_test_split(min=25):
    '''
    Gets the train/test set for testing from the Reuters corpus.
    Keeps only the documents that have a category in both the test and train
    set, with a possible user-defined minimum number of tokens.

    Parameters:
    ------------
    min: (default = 25) The minimum number of tokens in documents to test with.

    Returns:
    ------------
    train_set, train_target, test_set, test_target as lists
    '''
    # imports
    import re
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import CategorizedPlaintextCorpusReader

    # reading corpus
    reuters = LazyCorpusLoader('reuters', CategorizedPlaintextCorpusReader,
                               '(training|test).*', cat_file='cats.txt',
                               encoding='ISO-8859-2')
    documents = reuters.fileids()

    # splitting into train and test sets
    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

    # getting documents and their categories
    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    train_cat = [reuters.categories(doc_id) for doc_id in train_docs_id]
    test_cat = [reuters.categories(doc_id) for doc_id in test_docs_id]

    # formatting the train set, tokenizing and gathering stats for processing
    train_token_docs = []
    train_token_docs_length = []
    train_token_docs_unique = []
    for i in train_docs:
        tempy_tokens = tokenize(i)
        train_token_docs.append(" ".join(tempy_tokens))
        train_token_docs_length.append(len(tempy_tokens))
        train_token_docs_unique.append(len(set(tempy_tokens)))

    # formatting the test set, tokenizing and gathering stats for processing
    test_token_docs = []
    test_token_docs_length = []
    test_token_docs_unique = []
    for i in test_docs:
        tempy_tokens = tokenize(i)
        test_token_docs.append(" ".join(tempy_tokens))
        test_token_docs_length.append(len(tempy_tokens))
        test_token_docs_unique.append(len(set(tempy_tokens)))

    # removes any documents that do not meet the minimum tokens setting
    train_less_than_min = [
        n for n, i in enumerate(train_token_docs_length) if i < min
    ]
    test_less_than_min = [
        n for n, i in enumerate(test_token_docs_length) if i < min
    ]
    train_token_docs_more_than_min = [
        i for n, i in enumerate(train_token_docs) if n not in train_less_than_min
    ]
    test_token_docs_more_than_min = [
        i for n, i in enumerate(test_token_docs) if n not in test_less_than_min
    ]
    train_cat_more_than_min = [
        i for n, i in enumerate(train_cat) if n not in train_less_than_min
    ]
    test_cat_more_than_min = [
        i for n, i in enumerate(test_cat) if n not in test_less_than_min
    ]

    # getting documents with single categories
    # (corpus has some with multiple categories)
    cat_count_train = [len(i) for i in train_cat_more_than_min]
    cat_count_test = [len(i) for i in test_cat_more_than_min]
    single_cat_train = [n for n, i in enumerate(cat_count_train) if i == 1]
    single_cat_test = [n for n, i in enumerate(cat_count_test) if i == 1]
    train_single = [
        i for n, i in enumerate(train_token_docs_more_than_min)
        if n in single_cat_train
    ]
    test_single = [
        i for n, i in enumerate(test_token_docs_more_than_min)
        if n in single_cat_test
    ]
    train_single_cat = [
        i for n, i in enumerate(train_cat_more_than_min)
        if n in single_cat_train
    ]
    test_single_cat = [
        i for n, i in enumerate(test_cat_more_than_min)
        if n in single_cat_test
    ]

    # keeping only documents whose category occurs in both train and test
    train_cat_set = set([i[0] for i in train_single_cat])
    test_cat_set = set([i[0] for i in test_single_cat])
    mutual_cat = train_cat_set.intersection(test_cat_set)
    member_of_mutual_test = [
        n for n, i in enumerate(test_single_cat) if i[0] in mutual_cat
    ]
    member_of_mutual_train = [
        n for n, i in enumerate(train_single_cat) if i[0] in mutual_cat
    ]
    train_single2 = [
        i for n, i in enumerate(train_single) if n in member_of_mutual_train
    ]
    test_single2 = [
        i for n, i in enumerate(test_single) if n in member_of_mutual_test
    ]
    train_single_cat2 = [
        i for n, i in enumerate(train_single_cat) if n in member_of_mutual_train
    ]
    test_single_cat2 = [
        i for n, i in enumerate(test_single_cat) if n in member_of_mutual_test
    ]

    return train_single2, train_single_cat2, test_single2, test_single_cat2

from nltk.data import path
from nltk.tree import *

path.extend(['./testing/'])

s = '---'
print(s.split('-'))

icepahc = LazyCorpusLoader(
    'icepahc-v0.9/psd/', CategorizedBracketParseCorpusReader,
    r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
)

fileids = icepahc.fileids()  # TODO: Get fileid info per tree for tree ids
# sents = icepahc.parsed_sents()

def sent_text(sentence):
    ''' Takes in a nltk Tree object and returns the sentence text in string form '''
    text = []
    leaves = sentence.pos()
    for leaf in leaves:
        # print(leaf)
        if len(leaf[0]) == 1:
            leaf = leaf[1]
        elif leaf[0] == '---':
            leaf = '-'
        elif leaf[0] == '-----':
def main():
    IcePaHC_DIR = '../psd/corpora/icepahc-v0.9/psd'
    FIXED_IcePaHC_DIR = '../psd/corpora/icepahc-v0.9/psd_fix'
    fix_annotation_errors(IcePaHC_DIR, FIXED_IcePaHC_DIR)
    run_pre(FIXED_IcePaHC_DIR)

    path.extend(['..'])
    ICEPAHC = LazyCorpusLoader(
        'icepahc-v0.9/psd_fix/', IcePaHCFormatReader,
        r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
    )

    fileids = ICEPAHC.fileids()  # leave uncommented for whole corpus use
    # fileids = ['1150.homiliubok.rel-ser.psd']  # For debug use only
    # fileids = ['2008.mamma.nar-fic.psd', '2008.ofsi.nar-sag.psd']  # For debug use only

    # Instance of Converter class
    c = Converter(auto_tags='corpus')
    # c = Converter()
    total_sents = 0
    file_num = 1

    # OUTPUT_DIR = '../testing/CoNLLU_output/'
    OUTPUT_DIR = '../IcePaHC-CoNLLU/'
    if not os.path.isdir(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    # f = open('ofsi_conllu.conllu', 'w+')

    ''' Prints the dependency graph data in conllU format '''
    for fileid in fileids:
        OUT_FILE_NAME = re.sub(r'\.psd', '.conllu', fileid)
        OUT_FILE_PATH = OUTPUT_DIR + OUT_FILE_NAME
        OUT_FILE = open(OUT_FILE_PATH, 'w+')
        # file_length = len(ICEPAHC.parsed_sents(fileid))
        error_num = 0
        start = time.time()
        file_sents = 0

        # print('\nProcessing file: {0}...'.format(fileid))

        tree_counter = 0
        tag_dict = tagged_corpus(ICEPAHC.parsed_sents(fileid))
        c.set_tag_dict(tag_dict)
        to_join = []
        try:
            for tree in ICEPAHC.parsed_sents(fileid):
                # Catch error in corpus where punctuation tokens are missing
                tree = fix_IcePaHC_tree_errors(tree)

                # UniversalDependencyGraph object created from tree
                dep = c.create_dependency_graph(tree)

                # Sentences split between clauses joined together and output
                # written to file
                if dep.get_by_address(len(dep.nodes)-1)['word'] not in {'.', ':', '?', '!', 'kafli', 'kapítuli'} \
                        and len(dep.nodes) != 1:
                    to_join.append(dep)
                else:
                    if len(to_join) == 0:
                        # write out dep. graphs that don't need to be joined
                        dep_c = c.add_space_after(dep).to_conllU()

                        sent_id = re.sub(r'\.psd', '', fileid).upper() + ',' + str(file_sents+1) + '.' + str(total_sents+1)
                        sent_id_line = '# sent_id = ' + sent_id + '\n'
                        text_line = dep.plain_text() + '\n'
                        # icepahc_id_line = str(dep.original_ID_plain_text(corpus_name='IcePaHC')) + '\n'
                        icepahc_id_line = str(dep.original_ID_plain_text()) + '\n'

                        OUT_FILE.write(sent_id_line)
                        OUT_FILE.write(icepahc_id_line)
                        OUT_FILE.write(text_line)
                        OUT_FILE.write(dep_c)

                        file_sents += 1
                        total_sents += 1
                    else:
                        # write out joined dependency graphs
                        to_join.append(dep)
                        dep = c.add_space_after(c.join_graphs(to_join))
                        dep_c = dep.to_conllU()

                        sent_id = re.sub(r'\.psd', '', fileid).upper() + ',' + str(file_sents+1) + '.' + str(total_sents+1)
                        sent_id_line = '# sent_id = ' + sent_id + '\n'
                        text_line = dep.plain_text() + '\n'
                        # icepahc_id_line = str(dep.original_ID_plain_text(corpus_name='IcePaHC')) + '\n'
                        icepahc_id_line = str(dep.original_ID_plain_text()) + '\n'

                        OUT_FILE.write(sent_id_line)
                        OUT_FILE.write(icepahc_id_line)
                        OUT_FILE.write(text_line)
                        OUT_FILE.write(dep_c)

                        file_sents += 1
                        total_sents += 1

                        to_join = []

                tree_counter += 1
        except Exception as ex:
            print('ERROR', '# sent_id =', sent_id)
            print(tree.corpus_id)
            print(tree)
            print('Failure - {0}. Arguments:\n{1!r}'.format(type(ex).__name__, ex.args))
            raise
            error_num += 1

        run_post_file(OUT_FILE_PATH)

        end = time.time()
        duration = '%.2f' % float(end - start)

        # if error_num > 0:
        print('\t'.join([str(i) for i in [file_num, fileid, tree_counter, file_sents, error_num, str(duration) + ' sec']]))

        file_num += 1

from collections import defaultdict
import time
import re
import string

path.extend(['./testing/'])

DMII_combined = DMII_data.load_json('combined')  # TODO: Move to features script

icepahc = LazyCorpusLoader(
    'icepahc-v0.9/psd/', CategorizedBracketParseCorpusReader,
    r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
)

if __name__ == '__main__':
    fileids = icepahc.fileids()  # leave uncommented for whole corpus use
    # fileids = ['1350.bandamennM.nar-sag.psd']  # For debug use only

    c = Converter()  # Creates instance of Converter class

    total_sents = 0
    file_num = 1

    # f = open('homilia_conllu.conllu', 'w+')

    ''' Prints the dependency graph data in conllU format '''
    for fileid in fileids:
        error_num = 0
        start = time.time()
        file_sents = 0

        # print('\nProcessing file: {0}...'.format(fileid))

        for tree in icepahc.parsed_sents(fileid):
            treeID = fileid + '_' + str(file_sents+1) + '_' + str(total_sents+1)
    return source


connection = pyodbc.connect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=E:\\farsnettest.mdb")
c = connection.cursor()
# c.execute("select number,example from shir")

corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader, r'(?!\.).*\.xml')
word = u'شیر'
targ = 0

c.execute("select * from shir")
for row in c:
    print row

# out = codecs.open('d:\\shirham.txt','w','utf-8')

for file in corpus.fileids():
    # if num == 1000: break
    for doc in corpus.xml(file).getchildren():
        # cat = doc.getchildren()[3].text
        text = doc.getchildren()[5].text
        newtext = correctPersianString(text)
        allwords = text.split()
        sents = newtext.split('.')
        for sent in sents:
            if word in sent.split():
                targ += 1