def __init__(self, wiki_type, debug_flag=False):
    """Configure the reader for one of the supported wiki flavours.

    Args:
        wiki_type: One of 'wiki', 'simplewiki', 'wiktionary', 'wikibooks'
            or 'wikiversity'; stored on ``self.wiki_type``.
        debug_flag: Forwarded unchanged to ``CorpusReader.__init__``.
    """
    # (min_chars_per_line, min_words_per_section) handed to the base reader
    # for each wiki flavour.
    thresholds = {
        'wiki': (50, 50),
        'simplewiki': (1, 1),
        'wiktionary': (1, 3),
        'wikibooks': (1, 10),
        'wikiversity': (1, 3),
    }
    assert wiki_type in list(thresholds)
    self.wiki_type = wiki_type
    chars_per_line, words_per_section = thresholds[self.wiki_type]
    CorpusReader.__init__(self,
                          min_chars_per_line=chars_per_line,
                          min_words_per_section=words_per_section,
                          debug_flag=debug_flag)
    # Only the 'Noun' sections are used for wiktionary.
    if self.wiki_type == 'wiktionary':
        self.set_sections_to_use(['Noun'])
def __init__(self, path, tokenizer, search, write):
    """Index the corpus at ``path`` and print indexing statistics.

    Args:
        path: Corpus path; ``path + '.bin'`` is stat-ed for the on-disk
            index size and ``path + '_indexer.txt'`` is the dump target.
        tokenizer: Passed straight to ``CorpusReader``.
        search: Query string; when it is not ``''`` the result of
            ``indexer.search(search)`` is printed.
        write: When truthy, the index is written to ``<path>_indexer.txt``.
    """
    started_at = time.time()
    reader = CorpusReader(path, tokenizer)
    reader.processFile()

    # performance metrics
    print("Index time: {:.2f} seconds".format(time.time() - started_at))
    index_bytes = os.stat(path + '.bin').st_size
    print("Index Size on disk :", sizeof_fmt(index_bytes))

    # The vocabulary is read from ``reader.index`` while metrics, search and
    # dumping go through ``reader.indexer``.
    vocabulary = reader.index.keys()
    metrics(reader.indexer)
    print(
        f"Vocabulary: {len(vocabulary)} words, size: {sizeof_fmt(len(''.join(vocabulary)))}"
    )

    if search != '':
        print(reader.indexer.search(search))
    if write:
        reader.indexer.writeIndexToFile(f"{path}_indexer.txt")
from CorpusReader import CorpusReader
from utils import *
from stanford import CoreNLP
from dependency_tree import build_tree
from dependency_similarity import *
from feature import *
from NeuralLearner import *
import torch
import torch.nn as nn
import numpy as np
import sys
import xgboost as xgb

# data pre-processing
# This stage is switched off by the constant ``if False`` guard below.
# NOTE(review): the result is called ``train_data`` but is read from
# 'data/test-set.txt' - confirm which split is intended before enabling.
if False:
    reader = CorpusReader('data/test-set.txt')
    train_data = reader.data()

    # First pass: tokenize both sentences of every item.
    for item in train_data.values():
        item['token1'] = tokenized_sentence(item['Sentence1'])
        item['token2'] = tokenized_sentence(item['Sentence2'])

    # Second pass: start the CoreNLP server, then attach a dependency parse
    # tree for each tokenized sentence.
    corenlp = CoreNLP(sys.argv)
    corenlp.start_server()
    for k in train_data:
        print(k)
        item = train_data[k]
        sentence1 = list_to_string(item['token1'])
        item['d-tree1'] = corenlp.dependency_parse_tree(sentence1)
        sentence2 = list_to_string(item['token2'])
        item['d-tree2'] = corenlp.dependency_parse_tree(sentence2)
def read(self, htmldir, outfile, stop_words=set(), pos_words=set(), page_name_word_sets=None, corpus_words=None,
         page_title_ignore_suffixes=['-1', '-2', '- Advanced'], ignore_sections=set(),
         min_pos_words_in_page_name=0, min_pos_words_in_section=0,
         use_all_pages_match_pos_word=False, use_all_pages_match_sets=False, always_use_first_section=False,
         action='write'):
    """Parse every ``<digits>.html`` page in ``htmldir`` and feed its sections to the reader.

    For each page the first ``<h1>`` is taken as the title; the raw HTML is
    then split on ``<h1>``..``<h4>`` openers and every following heading plus
    its content becomes one section, passed to ``self._add_section``.

    Args:
        htmldir: Directory that is listed for HTML files.
        outfile, stop_words, pos_words, page_name_word_sets, corpus_words,
        min_pos_words_in_page_name, min_pos_words_in_section,
        use_all_pages_match_pos_word, use_all_pages_match_sets,
        always_use_first_section, action: Forwarded unchanged to
            ``self._reset``.
        page_title_ignore_suffixes: Suffixes stripped from a page title; only
            the first matching suffix is removed.
        ignore_sections: Regex patterns; a section whose lower-cased heading
            matches any of them (via ``re.match``) is skipped.

    Returns:
        ``self._locdic`` after ``self._end_action()`` has run.

    Raises:
        AssertionError: If no matching HTML file is found in ``htmldir``.
    """
    # NOTE: the set/list defaults are shared between calls. This method only
    # reads them; they are kept as-is so the signature stays unchanged.

    # Per the original author's note, the reader state lives in class-level
    # variables, so it has to be reset on every call.
    self._reset(outfile=outfile, stop_words=stop_words, pos_words=pos_words,
                page_name_word_sets=page_name_word_sets, corpus_words=corpus_words,
                min_pos_words_in_page_name=min_pos_words_in_page_name,
                min_pos_words_in_section=min_pos_words_in_section,
                use_all_pages_match_pos_word=use_all_pages_match_pos_word,
                use_all_pages_match_sets=use_all_pages_match_sets,
                always_use_first_section=always_use_first_section,
                action=action)
    parser = SimpleWordParser(tolower=True, ascii_conversion=True, ignore_special_words=False)
    # With the default action 'write' the original comment states that
    # _start_action opens the output file.
    self._start_action()
    start_time = time.time()

    # Only files named exactly "<digits>.html" are pages. The dot is escaped
    # and the pattern is anchored at the end so that names such as "12xhtml"
    # or "3.html.bak" are no longer picked up.
    filenames = ['%s/%s' % (htmldir, fname) for fname in os.listdir(htmldir)
                 if re.match(r'\d+\.html$', fname) is not None]
    assert len(filenames) > 0

    for ifname, fname in enumerate(filenames):
        print('Reading %s' % fname)
        with open(fname, 'rb') as myfile:
            text = myfile.read()
        soup = BeautifulSoup(text, 'lxml')
        if soup.h1 is None:
            print('Could not find page title in file %s - skipping' % fname)
            continue

        # A page may hold several <h1> tags; only the first one is the title.
        page_name = soup.h1.text.strip()
        # Titles such as "Momentum-1" carry a suffix that must be dropped.
        for ptis in page_title_ignore_suffixes:
            if page_name.endswith(ptis):
                page_name = page_name[:-len(ptis)]
                break
        page_name_words = parser.parse(page_name)
        # The part name is derived from the title words and the file index.
        page_name = CorpusReader.part_name_from_words(page_name_words, ifname)
        print('page name = %s' % page_name)
        self._add_page(page_name, page_name_words)

        # Split the raw HTML on heading openers. Because the pattern is a
        # capture group, odd indices hold the "<hN" openers and even indices
        # the text that follows them.
        parts = re.split('(<h[1-4])', text)
        # Start at 3: parts[1] and parts[2] are the title <h1> and its content.
        for ipart in range(3, len(parts), 2):
            section_soup = BeautifulSoup(parts[ipart] + parts[ipart + 1], 'lxml')
            # parts[ipart] is e.g. '<h2'; dropping '<' leaves the tag name.
            section_name = section_soup.find(parts[ipart][1:]).text.strip().lower()
            # Skip headings matching ignore_sections (e.g. review/practice).
            if any(re.match(isr, section_name) is not None for isr in ignore_sections):
                continue
            section_name_words = parser.parse(section_name)
            # Floor division keeps this an integer index (3 -> 1, 5 -> 2, ...).
            section_in_page = (ipart - 1) // 2

            # Collect the text of the <p> tags in this section.
            # NOTE(review): ``p.next`` is only the first node after each <p>
            # opener, so text after an inline tag is not included - confirm
            # this is intended.
            section_text = ''.join(p.next.strip() for p in section_soup.find_all('p'))
            # Per the original comment this maps symbols to English,
            # e.g. a Greek Delta to 'Delta'.
            section_text = HtmlReader.parse_text(section_text)
            section_words = parser.parse(section_text)
            self._add_section(page_name, page_name_words, section_name, section_name_words,
                              section_in_page, section_words)

    end_time = time.time()
    print('read_html total time = %.1f secs.' % (end_time - start_time))
    print('Read %d pages, %d sections; applied action on %d sections' % (self.num_pages, self.num_sections, self.num_section_action))
    self._end_action()
    return self._locdic
"WARNING: You have a CUDA device, so you should probably run with --gpu" ) else: if args.gpu: print( "You do not have a GPU device, so you should run CPU without --gpu option." ) exit() if 'word' not in args.features_level and 'character' not in args.features_level: exit( "features_level argument is empty. It should include at least one of [word,character] items." ) torch.manual_seed(args.seed) corpus_train_reader = CorpusReader(args.corpus_train_file, 1000000) # 100MB corpus_dev_reader = CorpusReader(args.corpus_dev_file, 1000000) # 100MB dictionary_word = DictionaryWord() dictionary_char = None if 'character' in args.features_level: dictionary_char = DictionaryCharacter() model = None # Load the pre-trained Model for fine-tuning if path.exists(args.output_model_path): print("Loading pre-trained Model...") model = LanguageModel.load_model( use_gpu=args.gpu, path_to_pretrained_model=args.output_model_path)
from levenshtein import levenshtein as lev
from collections import Counter
from pdb import set_trace
from Header import Header

header = Header()

# NOTE(review): 'ead ar' contains a space, unlike the other entries - confirm.
SUFF = ('eas', 'is', 'ais', 'as', 'eamar', 'amar', 'eabhair', 'abhair', 'ead ar', 'adar')

# Switches selecting how the three dialect corpora are loaded.
books = False
sentences = True

print("Loading Corpora...")
if books:
    # Munster is loaded first; its book count is passed to the other two
    # readers and used to truncate them.
    print("\tloading munster")
    M = CorpusReader('munster')
    print("\tloading connacht")
    C = CorpusReader('connacht', M.countBooks())
    C.truncateBooks(M.countBooks())
    print("\tloading ulster")
    U = CorpusReader('ulster', M.countBooks())
    U.truncateBooks(M.countBooks())
    l = [U, M, C]

if sentences:
    print("Creating Balanced Set of sentences")
    M, C, U = map(CorpusReader, ('munster', 'connacht', 'ulster'))
    l = [U, M, C]
    # Smallest sentence count among the three corpora.
    # NOTE(review): assumed to belong to the ``if sentences`` branch - confirm.
    MIN_LENG = min(corpus.countSentences() for corpus in l)
import pandas as pd
from CorpusReader import CorpusReader
import os

# Output CSVs are written next to this script.
file_dir = os.path.dirname(os.path.abspath(__file__))
train_file_path = os.path.join(file_dir, 'train_df.csv')
test_file_path = os.path.join(file_dir, 'test_df.csv')

# Features for both splits are extracted before anything is written.
# The input paths are relative to the current working directory.
train_cr = CorpusReader('./dataset/semeval_train.txt')
train_features = train_cr.feature_extract()
test_cr = CorpusReader('./dataset/semeval_test.txt')
test_features = test_cr.feature_extract()

# Each feature object becomes one row built from its attribute dict.
train_df = pd.DataFrame(list(map(vars, train_features)))
train_df.to_csv(train_file_path, index=False, header=True)

test_df = pd.DataFrame(list(map(vars, test_features)))
test_df.to_csv(test_file_path, index=False, header=True)
def LoadCorpus(self):
    """Open the corpus at ``self.readCorpusFilePath`` and display entry 1."""
    self.cr = CorpusReader(self.readCorpusFilePath)
    # getCorpu returns an (id, text) pair, which is handed on to the labels.
    first_id, first_text = self.cr.getCorpu(1)
    self.UpdataCorpu(first_id, first_text)
class MyMainWindow(QMainWindow, Ui_MainWindow):
    """Main window that wires the ``Ui_MainWindow`` widgets to their handlers.

    The handlers load an attribute JSON file, a corpus file and optional
    plug-in dictionaries, let the user compose a result string from list
    selections, and commit that result to the corpus reader. File paths and
    the input history are restored from and saved to a ``ConfigFile``.
    """

    # List widgets are laid out on grids this many columns wide.
    _GRID_COLUMNS = 5

    def __init__(self, parent=None):
        """Build the UI, restore the saved configuration and connect signals."""
        super(MyMainWindow, self).__init__(parent)
        self.setupUi(self)
        self.btnLoadPlugeDict.hide()
        self.viewList = []
        self.PluDictViewList = []
        self.historyInputList = []
        # Item lists of every loaded plug-in dictionary (searched by Research).
        self.PlugeDictionary = []
        # Readers stay None until a file is loaded; handlers check for that
        # instead of failing on a missing attribute.
        self.cr = None
        self.jr = None
        # Read the configuration file.
        self.ReadConfig()

        self.checkBoxPlugeDict.stateChanged.connect(self.LoadPlugeDict)
        self.btnExcel2Json.clicked.connect(self.Excel2Json)
        self.btnLoadJson.clicked.connect(self.getAttrFilePath)
        self.btnLoadCorpus.clicked.connect(self.getReadCorpusFilePath)
        self.btnPreviousPage.clicked.connect(self.PreviousPage)
        self.btnNextPage.clicked.connect(self.NextPage)
        self.btnJumpPage.clicked.connect(self.JumpPage)
        self.btnSaveCorpus.clicked.connect(self.getSaveCorpusFilePath)
        self.btnCommit.clicked.connect(self.Commit)
        self.btnLoadPlugeDict.clicked.connect(self.getPlugeDictionaryFilePath)
        self.btnOpenInputHistory.clicked.connect(self.ShowInputHistoryDock)
        self.btnClearInputHistory.clicked.connect(self.ClearInputHistory)
        self.btnResearch.clicked.connect(self.Research)
        self.lineEditResearch.textChanged.connect(self.Research)
        self.listWidgetResearch.itemClicked.connect(self.CommitResearch)
        self.btnEntityResearch.clicked.connect(self.EntityResearch)
        self.SetHistoryInputDock()

    def CommitResearch(self, qModelIndex):
        """Append the selected search hits to the text captured by Research."""
        self.UpdateAttr(1)
        # Everything before the first '}' of the captured text is kept, then
        # the selected hits are appended and the brace is closed again.
        prefix = self.sourceText.split('}')[0]
        picked = [t.text() for t in self.listWidgetResearch.selectedItems()]
        self.lineEditResult.setText(prefix + ',' + ','.join(picked) + '}')

    def Research(self):
        """List every plug-in dictionary item matching the search pattern."""
        self.listWidgetResearch.clear()
        self.sourceText = self.lineEditResult.text()
        compileString = self.lineEditResearch.text()
        try:
            cp = re.compile('%s' % compileString)
        except re.error:
            # This slot runs on every keystroke, so a half-typed pattern is
            # expected; show no hits instead of raising.
            return
        for items in self.PlugeDictionary:
            for item in items:
                if cp.search(item) is not None:
                    self.listWidgetResearch.addItem(item)

    def ClearInputHistory(self):
        """Empty both the history widget and the backing list."""
        self.historyInputListWidget.clear()
        self.historyInputList = []

    def ShowInputHistoryDock(self):
        """Show the input-history dock widget."""
        self.dockWidgetHistoryInput.show()

    def SetHistoryInputDock(self):
        """Create the history list widget and fill it from the stored history."""
        self.historyInputListWidget = QListWidget()
        self.dockWidgetHistoryInput.setWidget(self.historyInputListWidget)
        self.historyInputListWidget.addItems(self.historyInputList)
        self.historyInputListWidget.itemClicked.connect(self.HistoryInput2lineEditResult)

    def HistoryInput2lineEditResult(self, qModelIndex):
        """Copy the selected history entries into the result line edit."""
        chosen = [t.text() for t in self.historyInputListWidget.selectedItems()]
        self.lineEditResult.setText(','.join(chosen))

    def LoadPlugeDict(self):
        """Show or hide the plug-in dictionary controls to match the checkbox."""
        if self.checkBoxPlugeDict.isChecked():
            self.btnLoadPlugeDict.show()
            for listWidget in self.PluDictViewList:
                listWidget.show()
        else:
            self.btnLoadPlugeDict.hide()
            for listWidget in self.PluDictViewList:
                listWidget.hide()
        self.UpdateAttr('')

    def Excel2Json(self):
        """Open the Excel-to-JSON dialog."""
        ex = excel2json(self)
        ex.show()

    def getAttrFilePath(self):
        """Ask for an attribute JSON file and load it if one was chosen."""
        tmpPath, _ = QFileDialog.getOpenFileName(self, r'打开JSON', r'./', r'JSON File(*.json)')
        if tmpPath != '':
            self.attrFilePath = tmpPath
            self.LoadJson()

    def getReadCorpusFilePath(self):
        """Ask for a corpus Excel file and load it if one was chosen."""
        tmpPath, _ = QFileDialog.getOpenFileName(self, r'打开语料', r'./', r'Excel File(*.xls *.xlsx)')
        if tmpPath != '':
            self.readCorpusFilePath = tmpPath
            self.LoadCorpus()

    def getSaveCorpusFilePath(self):
        """Ask where to save the corpus and apply the path if one was chosen."""
        tmpPath, _ = QFileDialog.getSaveFileName(self, r'保存语料', r'./', r'Excel File (*.xls)')
        if tmpPath != '':
            self.saveCorpusFilePath = tmpPath
            self.SaveCorpus()

    def getPlugeDictionaryFilePath(self):
        """Ask for a plug-in dictionary JSON file and load it if one was chosen."""
        tmpPath, _ = QFileDialog.getOpenFileName(self, r'打开JSON', r'./', r'JSON File(*.json)')
        if tmpPath != '':
            self.plugeDictionaryFilePath = tmpPath
            self.PlugeDict()

    def LoadJson(self):
        """Open the attribute JSON file and list its entities."""
        self.jr = JsonReader(self.attrFilePath)
        self.ShowEntity()

    def _SetEntityItems(self, entities):
        """Replace the entity combo box items without firing ShowAttribute."""
        try:
            self.comboBoxEntitys.currentIndexChanged.disconnect(self.ShowAttribute)
        except (TypeError, RuntimeError):
            # Raised when the slot was not connected yet (first call).
            print('the slot dont have sign')
        self.comboBoxEntitys.clear()
        self.comboBoxEntitys.addItems(entities)
        self.comboBoxEntitys.currentIndexChanged.connect(self.ShowAttribute)

    def EntityResearch(self):
        """Restrict the entity combo box to entities matching the pattern."""
        if self.jr is None:
            # No attribute JSON loaded yet, so there is nothing to filter.
            return
        try:
            cp = re.compile(self.lineEditEntityResearch.text())
        except re.error:
            # Invalid pattern: leave the current entity list untouched.
            return
        researchList = [entity for entity in self.jr.getEntityList()
                        if cp.search(entity) is not None]
        self._SetEntityItems(researchList)

    def ShowEntity(self):
        """Fill the entity combo box with every entity of the JSON reader."""
        self._SetEntityItems(self.jr.getEntityList())

    def ShowAttribute(self, i):
        """Show one list widget per attribute of the selected entity.

        Existing widgets are reused and refilled; missing ones are created
        and surplus ones are hidden.
        """
        entity = self.comboBoxEntitys.currentText()
        attrList, attrListLength = self.jr.getCurrentEntityAttributeList(entity)
        existing = len(self.viewList)
        if existing < attrListLength:
            for listIndex, view in enumerate(self.viewList):
                view.clear()
                view.addItems(self.jr.getAttributeItems(entity, listIndex))
                # A view may have been hidden by an earlier entity with fewer
                # attributes; make it visible again.
                view.show()
            for listIndex in range(existing, attrListLength):
                # The cell is derived from the view's index so that new
                # widgets never land on an occupied cell, however many views
                # already exist.
                widgetRowIndex, widgetColumnIndex = divmod(listIndex, self._GRID_COLUMNS)
                print(r'widgetRowIndex:{},widgetColumnIndex:{}'.format(widgetRowIndex, widgetColumnIndex))
                tmpListView = QListWidget()
                tmpListView.addItems(self.jr.getAttributeItems(entity, listIndex))
                tmpListView.itemClicked.connect(self.UpdateAttr)
                # Hold CTRL to select several items.
                tmpListView.setSelectionMode(QAbstractItemView.ExtendedSelection)
                self.viewList.append(tmpListView)
                self.gridLayoutAttrListView.addWidget(tmpListView, widgetRowIndex, widgetColumnIndex)
        else:
            for index in range(attrListLength):
                self.viewList[index].clear()
                self.viewList[index].addItems(self.jr.getAttributeItems(entity, index))
                self.viewList[index].show()
            for index in range(attrListLength, existing):
                self.viewList[index].hide()

    def UpdateAttr(self, qModelIndex):
        """Write the current selections as '{a,b,...}' into the result box."""
        views = list(self.viewList)
        # Plug-in dictionary selections only count while the box is checked.
        if self.checkBoxPlugeDict.isChecked():
            views += self.PluDictViewList
        result = [t.text() for view in views for t in view.selectedItems()]
        self.lineEditResult.setText('{' + ','.join(result) + '}')

    def LoadCorpus(self):
        """Open the corpus at ``self.readCorpusFilePath`` and display entry 1."""
        self.cr = CorpusReader(self.readCorpusFilePath)
        corpusId, corpu = self.cr.getCorpu(1)
        self.UpdataCorpu(corpusId, corpu)

    # Previous page
    def PreviousPage(self):
        """Display the entry before the current one and its stored result."""
        if self.cr is None:
            return
        corpusId, corpu = self.cr.getCorpu(self.cr.getId() - 1)
        self.UpdataCorpu(corpusId, corpu)
        self.lineEditResult.setText(self.cr.getCorpuResult(corpusId))

    # Next page
    def NextPage(self):
        """Display the entry after the current one and its stored result."""
        if self.cr is None:
            return
        corpusId, corpu = self.cr.getCorpu(self.cr.getId() + 1)
        self.UpdataCorpu(corpusId, corpu)
        self.lineEditResult.setText(self.cr.getCorpuResult(corpusId))

    # Jump to page
    def JumpPage(self):
        """Display the entry whose number is typed into ``lineEditPage``."""
        if self.cr is None:
            return
        try:
            corpusId = int(self.lineEditPage.text())
        except ValueError:
            # Empty or non-numeric page number: stay on the current entry.
            return
        # NOTE(review): unlike PreviousPage/NextPage, the id returned by
        # getCorpu is ignored here - confirm whether it should be used.
        _, corpu = self.cr.getCorpu(corpusId)
        self.UpdataCorpu(corpusId, corpu)
        self.lineEditResult.setText(self.cr.getCorpuResult(corpusId))

    def UpdataCorpu(self, id, corpu):
        """Show the entry number and source text in their labels."""
        self.lblNo.setText(str(id))
        self.lblSourceCorpu.setText(corpu)

    def SaveCorpus(self):
        """Hand ``self.saveCorpusFilePath`` to the corpus reader."""
        if self.cr is None:
            # The path stays stored on the window; there is no reader yet.
            return
        self.cr.setSavePath(self.saveCorpusFilePath)

    # Submit the result
    def Commit(self):
        """Commit the result text for the current entry and save the corpus."""
        if self.cr is None:
            return
        corpusId = self.cr.getId()
        corpu = self.lineEditResult.text()
        self.cr.Commit(corpusId, corpu)
        self.cr.Save()
        # Remember the input in the history list (once per distinct text).
        if corpu not in self.historyInputList:
            self.historyInputList.append(corpu)
            self.historyInputListWidget.addItem(corpu)

    def PlugeDict(self):
        """Load a plug-in dictionary and add one hidden list widget per title."""
        self.plugeDictjr = JsonReader(self.plugeDictionaryFilePath)
        self.plugeDictAttrList = self.plugeDictjr.getEntityList()
        for attrTitle in self.plugeDictAttrList:
            # Numbering continues after widgets from earlier loads so that a
            # second dictionary does not land on already occupied cells.
            widgetRowIndex, widgetColumnIndex = divmod(len(self.PluDictViewList), self._GRID_COLUMNS)
            print(r'widgetRowIndex:{},widgetColumnIndex:{}'.format(widgetRowIndex, widgetColumnIndex))
            tmpListView = QListWidget()
            item = self.plugeDictjr.getPlugeDictAttrItems(attrTitle)
            self.PlugeDictionary.append(item)
            tmpListView.addItems(item)
            tmpListView.itemClicked.connect(self.UpdateAttr)
            # Hold CTRL to select several items.
            tmpListView.setSelectionMode(QAbstractItemView.ExtendedSelection)
            self.PluDictViewList.append(tmpListView)
            tmpListView.hide()
            self.gridLayoutPlugeDictListView.addWidget(tmpListView, widgetRowIndex, widgetColumnIndex)

    def closeEvent(self, event):
        """Persist the input history and the known file paths on close.

        Overrides ``closeEvent`` so this runs when the window is closed.

        :param event: the event triggered by close()
        :return: None
        """
        configItems = ['historyInputList&&' + "||".join(self.historyInputList)]
        missing = False
        # Each path is handled on its own so that one unset path no longer
        # prevents the paths after it from being saved.
        for key in ('readCorpusFilePath', 'attrFilePath',
                    'plugeDictionaryFilePath', 'saveCorpusFilePath'):
            value = getattr(self, key, None)
            if value is None:
                missing = True
            else:
                configItems.append(key + '&&' + value)
        if missing:
            print('存在设置没有设定')
        self.configFile.WriteConfig(configItems)

    def ReadConfig(self):
        """Restore file paths and input history from the config file.

        Each ``key&&value`` item sets the matching attribute and triggers the
        corresponding load handler.
        """
        self.configFile = ConfigFile()
        configItems = self.configFile.ReadConfig()
        for item in configItems:
            # Split on the first '&&' only so a value containing '&&' survives.
            part = item.strip('\n').split('&&', 1)
            if len(part) == 1:
                continue
            key, value = part
            if key == 'attrFilePath':
                self.attrFilePath = value
                self.LoadJson()
            elif key == 'plugeDictionaryFilePath':
                self.plugeDictionaryFilePath = value
                self.PlugeDict()
            elif key == 'saveCorpusFilePath':
                self.saveCorpusFilePath = value
                self.SaveCorpus()
            elif key == 'readCorpusFilePath':
                self.readCorpusFilePath = value
                self.LoadCorpus()
            elif key == 'historyInputList':
                # An empty saved history is written as '' and must not come
                # back as a single blank entry.
                self.historyInputList = [entry for entry in value.split('||') if entry]
print total_num, error_num return float(total_num - error_num) / (total_num) if __name__ == '__main__': brill_tagger = None mle_tagger = None try: fin = open('mletagger.model', 'rb') mle_tagger = load(fin) print "MLE tagger loaded" fin.close() # model doesn't exist except IOError: print "MLE model not found! Retraining..." print "Loading training corpus..." train_corpus = CorpusReader.readin('train.pos') print "Corpus loaded" print "Learning MLE tagger..." model = MLETagLearner.learn(train_corpus) mle_tagger = MLETagger(model) print "MLE tagger learned" fout = open('mletagger.model', 'wb') dump(mle_tagger, fout, -1) fout.close() try: fin = open('brilltagger.model', 'rb') brill_tagger = load(fin) print "Brill tagger loaded!" fin.close()
print('') print('-printing each lemma-') nlpPipeLine.createLemma(nlp, sentTest) print('') print('-printing each POS tag-') nlpPipeLine.createPOS(nlp, sentTest) print('') print('-printing all Dependency parse tree-') nlpPipeLine.createDepParse(nlp,sentTest) data_folder_train = Path("data/train-set.txt") trainCorpusObject = CorpusReader(data_folder_train) data_folder_test = Path("data/dev-set.txt") devCorpusObject = CorpusReader(data_folder_test) mlObject = MachineLearningTasks(trainCorpusObject, devCorpusObject) #do the nlp pipeline for each parah in corpusObject #store in the appropriate HashMap dict """a = 0 for corpusParah in trainCorpusObject.corpus: #doc1 = nlp(corpusParah.hm1["sent"]) #doc2 = nlp(corpusParah.hm2["sent"]) if(a==2):
from CorpusReader import CorpusReader
from levenshtein import levenshtein as lev
from collections import Counter
from pdb import set_trace
from Header import Header

header = Header()

# NOTE(review): 'ead ar' contains a space, unlike the other entries - confirm.
SUFF = ('eas', 'is', 'ais', 'as', 'eamar', 'amar', 'eabhair', 'abhair', 'ead ar', 'adar')

# Switches selecting how the three dialect corpora are loaded.
books = False
sentences = True

print("Loading Corpora...")
if books:
    # Munster is loaded first; its book count is passed to the other two
    # readers and used to truncate them.
    print("\tloading munster")
    M = CorpusReader('munster')
    print("\tloading connacht")
    C = CorpusReader('connacht', M.countBooks())
    C.truncateBooks(M.countBooks())
    print("\tloading ulster")
    U = CorpusReader('ulster', M.countBooks())
    U.truncateBooks(M.countBooks())
    l = [U, M, C]

if sentences:
    print("Creating Balanced Set of sentences")
    M, C, U = map(CorpusReader, ('munster', 'connacht', 'ulster'))
    l = [U, M, C]
    # Smallest sentence count among the three corpora.
    # NOTE(review): assumed to belong to the ``if sentences`` branch - confirm.
    MIN_LENG = min(corpus.countSentences() for corpus in l)