Example #1
 def __init__(self, wiki_type, debug_flag=False):
     assert wiki_type in ['wiki', 'simplewiki', 'wiktionary', 'wikibooks', 'wikiversity']
     self.wiki_type = wiki_type
     if self.wiki_type == 'wiki':
         min_chars_per_line, min_words_per_section = 50, 50
     elif self.wiki_type == 'simplewiki':
         min_chars_per_line, min_words_per_section = 1, 1
     elif self.wiki_type == 'wiktionary':
         min_chars_per_line, min_words_per_section = 1, 3
     elif self.wiki_type == 'wikibooks':
         min_chars_per_line, min_words_per_section = 1, 10
     elif self.wiki_type == 'wikiversity':
         min_chars_per_line, min_words_per_section = 1, 3
     CorpusReader.__init__(self, min_chars_per_line=min_chars_per_line, min_words_per_section=min_words_per_section, debug_flag=debug_flag)
     if self.wiki_type == 'wiktionary':
         self.set_sections_to_use(['Noun'])
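The if/elif ladder above simply maps each wiki type to a (min_chars_per_line, min_words_per_section) pair. A table-driven sketch of the same selection, with the values copied from the code above (THRESHOLDS and thresholds_for are names introduced here for illustration):

# Hypothetical refactor: one lookup table instead of an if/elif chain.
THRESHOLDS = {
    'wiki':        (50, 50),  # (min_chars_per_line, min_words_per_section)
    'simplewiki':  (1, 1),
    'wiktionary':  (1, 3),
    'wikibooks':   (1, 10),
    'wikiversity': (1, 3),
}

def thresholds_for(wiki_type):
    # a KeyError on an unknown type plays the role of the assert above
    return THRESHOLDS[wiki_type]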
Example #2
    def __init__(self, path, tokenizer, search, write):
        start = time.time()

        cr = CorpusReader(path, tokenizer)
        cr.processFile()

        # performance metrics
        print("Index time: {:.2f} seconds".format(time.time() - start))
        size = os.stat(path + '.bin').st_size
        print("Index size on disk:", sizeof_fmt(size))
        words = cr.index.keys()
        metrics(cr.indexer)

        print(
            f"Vocabulary: {len(words)} words, size: {sizeof_fmt(len(''.join(words)))}"
        )
        if search != '':
            print(cr.indexer.search(search))
        if write:
            cr.indexer.writeIndexToFile(f"{path}_indexer.txt")
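This snippet relies on a sizeof_fmt helper that is not shown. A minimal sketch, assuming the conventional human-readable byte formatting (the project's real helper may differ):

def sizeof_fmt(num, suffix='B'):
    # format a byte count as e.g. '1.2MiB'
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti']:
        if abs(num) < 1024.0:
            return '%3.1f%s%s' % (num, unit, suffix)
        num /= 1024.0
    return '%.1f%s%s' % (num, 'Pi', suffix)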
Example #3
from CorpusReader import CorpusReader
from utils import *
from stanford import CoreNLP
from dependency_tree import build_tree
from dependency_similarity import *
from feature import *
from NeuralLearner import *
import torch
import torch.nn as nn
import numpy as np
import sys
import xgboost as xgb

# data pre-processing (guarded off: flip the condition below to re-run it)
if False:
    reader = CorpusReader('data/test-set.txt')

    train_data = reader.data()

    for _, item in train_data.items():
        item['token1'] = tokenized_sentence(item['Sentence1'])
        item['token2'] = tokenized_sentence(item['Sentence2'])

    corenlp = CoreNLP(sys.argv)
    corenlp.start_server()
    for k, item in train_data.items():
        print(k)
        item['d-tree1'] = corenlp.dependency_parse_tree(
            list_to_string(item['token1']))
        item['d-tree2'] = corenlp.dependency_parse_tree(
            list_to_string(item['token2']))
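The bare "if False:" guard works, but a named flag states the intent of the disabled block more clearly. A sketch (PREPROCESS is a name introduced here, not in the original):

# hypothetical replacement for the bare "if False:" toggle above
PREPROCESS = False  # set True to re-run tokenization and dependency parsing

if PREPROCESS:
    reader = CorpusReader('data/test-set.txt')
    train_data = reader.data()
    # ... tokenization and parsing as in the block above ...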
Example #4
    def read(self, htmldir, outfile, stop_words=set(), pos_words=set(), page_name_word_sets=None, corpus_words=None,
             page_title_ignore_suffixes=['-1', '-2', '- Advanced'],
             ignore_sections=set(),
             min_pos_words_in_page_name=0, min_pos_words_in_section=0,
             use_all_pages_match_pos_word=False, use_all_pages_match_sets=False, always_use_first_section=False,
             action='write'):

        # reset the class variables each time: these are static variables that belong to the class, not to a particular instance
        self._reset(outfile=outfile, stop_words=stop_words, pos_words=pos_words, page_name_word_sets=page_name_word_sets, corpus_words=corpus_words,
                    min_pos_words_in_page_name=min_pos_words_in_page_name, min_pos_words_in_section=min_pos_words_in_section,
                    use_all_pages_match_pos_word=use_all_pages_match_pos_word, use_all_pages_match_sets=use_all_pages_match_sets,
                    always_use_first_section=always_use_first_section,
                    action=action)

        parser = SimpleWordParser(tolower=True, ascii_conversion=True, ignore_special_words=False)
        # the action variable is 'write', so _start_action will open the output file and write to it
        self._start_action()
        page_name, section_name, section_in_page = None, None, 0
        page_name_words, section_words = [], []
        start_time = time.time()
        # only include files named <number>.html, i.e. skip the table HTML files
        filenames = ['%s/%s'%(htmldir,fname) for fname in os.listdir(htmldir) if re.match(r'(\d+)\.html', fname) is not None]
        assert len(filenames)>0
        for ifname,fname in enumerate(filenames):
            print 'Reading %s' % fname
            with open(fname, 'rb') as myfile:
                # the entire file contents as a single string
                text = myfile.read()
            soup = BeautifulSoup(text, 'lxml')
            if soup.h1 is None:
                print 'Could not find page title in file %s - skipping' % fname
                continue
            # the html file may contain several h1 tags; only the first one is the page title
            page_name = soup.h1.text.strip()
            # e.g. a page name like 'Momentum-1' should have its '-1' suffix removed
            for ptis in page_title_ignore_suffixes:
                if page_name.endswith(ptis):
                    page_name = page_name[:-len(ptis)]
                    break
            page_name_words = parser.parse(page_name)
            # e.g. page name = 'surface processes and landforms __0';
            # this normalized name is what gets written to the output file
            page_name = CorpusReader.part_name_from_words(page_name_words, ifname)
            print 'page name = %s' % page_name
            self._add_page(page_name, page_name_words)
            # split the text (without the title) on <h1>-<h4> opening tags, keeping the delimiters
            parts = re.split('(<h[1-4])', text)
            # start from 3 because the first 3 parts belong to the title <h1> tag, which should be skipped
            for ipart in range(3,len(parts),2):
                # odd-indexed parts are the splitter tags,
                # even-indexed parts are the contents that follow each tag
                soup = BeautifulSoup(parts[ipart] + parts[ipart+1], 'lxml')
                section_name = soup.find(parts[ipart][1:]).text.strip().lower()
                # skip sections whose names match the ignore patterns,
                # e.g. set(['review', 'practice', 'references', 'explore more.*']);
                # these are review sections that contain no science content
                if np.any([(re.match(isr, section_name) is not None) for isr in ignore_sections]):
                    continue
                section_name_words = parser.parse(section_name)
                section_in_page = (ipart - 1) / 2
                # only select text from all the <p> tags within each section
                text = ''
                for p in soup.find_all('p'):
                    text += p.next.strip()
                # replace some HTML entities with English words, e.g. '&#916;' -> 'Delta'
                text = HtmlReader.parse_text(text)
                # word tokenizing
                words = parser.parse(text)
                section_words = words
                # add this section (with action='write', it is written to the output file);
                # note that section_name itself is not written
                self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)

        end_time = time.time()
        print 'read_html total time = %.1f secs.' % (end_time-start_time)
        print 'Read %d pages, %d sections; applied action on %d sections' % (self.num_pages, self.num_sections, self.num_section_action)
        self._end_action()

        return self._locdic
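The loop over parts depends on a detail of re.split: when the pattern contains a capturing group, the delimiters are kept in the result, so tag markers and contents alternate. A small standalone demonstration of why the reader starts at index 3 and steps by 2:

import re

html = '<h1>Title</h1><h2>Intro</h2><p>a</p><h3>More</h3><p>b</p>'
parts = re.split('(<h[1-4])', html)
# parts == ['', '<h1', '>Title</h1>',
#           '<h2', '>Intro</h2><p>a</p>',
#           '<h3', '>More</h3><p>b</p>']
# parts[0:3] cover the <h1> page title, so sections start at index 3,
# each one a (delimiter, content) pair
for i in range(3, len(parts), 2):
    print(parts[i] + parts[i + 1])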
Example #5
            "WARNING: You have a CUDA device, so you should probably run with --gpu"
        )
else:
    if args.gpu:
        print(
            "You do not have a GPU device, so you should run CPU without --gpu option."
        )
        exit()

if 'word' not in args.features_level and 'character' not in args.features_level:
    exit(
        "features_level argument is empty. It should include at least one of [word,character] items."
    )

torch.manual_seed(args.seed)
corpus_train_reader = CorpusReader(args.corpus_train_file, 1000000)  # 100MB
corpus_dev_reader = CorpusReader(args.corpus_dev_file, 1000000)  # 100MB

dictionary_word = DictionaryWord()
dictionary_char = None

if 'character' in args.features_level:
    dictionary_char = DictionaryCharacter()

model = None
# Load the pre-trained Model for fine-tuning
if path.exists(args.output_model_path):

    print("Loading pre-trained Model...")
    model = LanguageModel.load_model(
        use_gpu=args.gpu, path_to_pretrained_model=args.output_model_path)
Example #6
from CorpusReader import CorpusReader
from levenshtein import levenshtein as lev
from collections import Counter
from pdb import set_trace
from Header import Header

header = Header()
SUFF = ('eas', 'is', 'ais', 'as', 'eamar', 'amar', 'eabhair', 'abhair',
        'eadar', 'adar')

books = False
sentences = True

print "Loading Corpora..."
if books:
    print "\tloading munster"
    M = CorpusReader('munster')
    print "\tloading connacht"
    C = CorpusReader('connacht', M.countBooks())
    C.truncateBooks(M.countBooks())
    print "\tloading ulster"
    U = CorpusReader('ulster', M.countBooks())
    U.truncateBooks(M.countBooks())
    l = [U, M, C]
#print "Done."
if sentences:
    print "Creating Balanced Set of sentences"
    M = CorpusReader('munster')
    C = CorpusReader('connacht')
    U = CorpusReader('ulster')
    l = [U, M, C]
    MIN_LENG = min([x.countSentences() for x in l])
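MIN_LENG is the size of the smallest of the three corpora; a balanced set then draws exactly that many sentences from each dialect. A sketch of the balancing step using plain lists (the CorpusReader sentence API is not shown above, so list slicing stands in for it):

# hypothetical balancing step: trim every corpus to the smallest size
corpora = {
    'ulster':   ['s1', 's2', 's3', 's4'],
    'munster':  ['s1', 's2'],
    'connacht': ['s1', 's2', 's3'],
}
min_leng = min(len(sents) for sents in corpora.values())
balanced = {name: sents[:min_leng] for name, sents in corpora.items()}
# every dialect now contributes exactly min_leng sentences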
Example #7
import pandas as pd
from CorpusReader import CorpusReader

import os

file_dir = os.path.dirname(os.path.abspath(__file__))
train_file_path = os.path.join(file_dir, 'train_df.csv')
test_file_path = os.path.join(file_dir, 'test_df.csv')

train_cr = CorpusReader('./dataset/semeval_train.txt')
train_features = train_cr.feature_extract()

test_cr = CorpusReader('./dataset/semeval_test.txt')
test_features = test_cr.feature_extract()

train_df = pd.DataFrame([t.__dict__ for t in train_features])
train_df.to_csv(train_file_path, index=False, header=True)

test_df = pd.DataFrame([t.__dict__ for t in test_features])
test_df.to_csv(test_file_path, index=False, header=True)
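pd.DataFrame([t.__dict__ for t in train_features]) turns each feature object's attribute dict into one row, with attribute names as columns. A self-contained illustration (Feature and its fields are stand-ins for whatever feature_extract actually returns):

import pandas as pd

class Feature:
    def __init__(self, length, overlap):
        self.length = length
        self.overlap = overlap

features = [Feature(12, 0.5), Feature(7, 0.25)]
df = pd.DataFrame([f.__dict__ for f in features])
#    length  overlap
# 0      12     0.50
# 1       7     0.25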

Example #8
 def LoadCorpus(self):
     self.cr=CorpusReader(self.readCorpusFilePath)
     id,corpu=self.cr.getCorpu(1)
     self.UpdataCorpu(id,corpu)
Example #9
class MyMainWindow(QMainWindow, Ui_MainWindow):
    def __init__(self, parent=None):    
        super(MyMainWindow, self).__init__(parent)
        self.setupUi(self)
        self.btnLoadPlugeDict.hide()

        self.viewList = list()
        self.PluDictViewList = list()
        self.historyInputList = list()
        # plug-in dictionary list
        self.PlugeDictionary=list()
        '''
        read the config file
        '''
        self.ReadConfig()

        self.checkBoxPlugeDict.stateChanged.connect(self.LoadPlugeDict)
        self.btnExcel2Json.clicked.connect(self.Excel2Json)
        self.btnLoadJson.clicked.connect(self.getAttrFilePath)
        self.btnLoadCorpus.clicked.connect(self.getReadCorpusFilePath)
        self.btnPreviousPage.clicked.connect(self.PreviousPage)
        self.btnNextPage.clicked.connect(self.NextPage)
        self.btnJumpPage.clicked.connect(self.JumpPage)
        self.btnSaveCorpus.clicked.connect(self.getSaveCorpusFilePath)
        self.btnCommit.clicked.connect(self.Commit)
        self.btnLoadPlugeDict.clicked.connect(self.getPlugeDictionaryFilePath)
        self.btnOpenInputHistory.clicked.connect(self.ShowInputHistoryDock)
        self.btnClearInputHistory.clicked.connect(self.ClearInputHistory)
        self.btnResearch.clicked.connect(self.Research)
        self.lineEditResearch.textChanged.connect(self.Research)
        self.listWidgetResearch.itemClicked.connect(self.CommitResearch)
        self.btnEntityResearch.clicked.connect(self.EntityResearch)
        self.SetHistoryInputDock()

    def CommitResearch(self,qModelIndex):
        self.UpdateAttr(1)
        text=self.sourceText
        text=text.split('}')[0]
        tlist=self.listWidgetResearch.selectedItems()
        tlist=[t.text() for t in list(tlist)]
        text=text+','+','.join(tlist)+'}'
        self.lineEditResult.setText(text)

    def Research(self):
        self.listWidgetResearch.clear()
        self.sourceText=self.lineEditResult.text()
        compileString=self.lineEditResearch.text()
        #print(compileString)
        cp=re.compile('%s'%compileString)
        for items in self.PlugeDictionary:
            for item in items:
               #print(item)
               if len(re.findall(cp,item))!=0:
                   self.listWidgetResearch.addItem(item)

    def ClearInputHistory(self):
        self.historyInputListWidget.clear()
        self.historyInputList=[]

    def ShowInputHistoryDock(self):
        self.dockWidgetHistoryInput.show()

    def SetHistoryInputDock(self):
        self.historyInputListWidget=QListWidget()
        self.dockWidgetHistoryInput.setWidget(self.historyInputListWidget)
        self.historyInputListWidget.addItems(self.historyInputList)
        self.historyInputListWidget.itemClicked.connect(self.HistoryInput2lineEditResult)

    def HistoryInput2lineEditResult(self,qModelIndex):
        #print(qModelIndex)
        tlist=self.historyInputListWidget.selectedItems()
        text=[t.text() for t in list(tlist)]
        self.lineEditResult.setText(','.join(text))

    def LoadPlugeDict(self):
        if self.checkBoxPlugeDict.isChecked():
            self.btnLoadPlugeDict.show()
            for listWidget in self.PluDictViewList:
                listWidget.show()
        else:
            self.btnLoadPlugeDict.hide()
            for listWidget in self.PluDictViewList:
                listWidget.hide()

        self.UpdateAttr('')

    def Excel2Json(self):
        ex = excel2json(self)
        ex.show()

    def getAttrFilePath(self):
        tmpPath, _ = QFileDialog.getOpenFileName(self,
                                                  r'Open JSON',
                                                  r'./',
                                                  r'JSON File(*.json)')
        if tmpPath !='':
            self.attrFilePath=tmpPath
            self.LoadJson()

    def getReadCorpusFilePath(self):
        tmpPath, _ = QFileDialog.getOpenFileName(self,
                                                  r'Open corpus',
                                                  r'./',
                                                  r'Excel File(*.xls *.xlsx)')
        if tmpPath !='':
            self.readCorpusFilePath=tmpPath
            self.LoadCorpus()

    def getSaveCorpusFilePath(self):
        filepath,_ = QFileDialog.getSaveFileName(self,
                                                r'Save corpus',
                                                r'./',
                                                r'Excel File (*.xls)')
        tmpPath=filepath
        if tmpPath !='':
            self.saveCorpusFilePath=tmpPath
            self.SaveCorpus()

    def getPlugeDictionaryFilePath(self):
        tmpPath, _ = QFileDialog.getOpenFileName(self,
                                                  r'Open JSON',
                                                  r'./',
                                                  r'JSON File(*.json)')
        if tmpPath !='':
            self.plugeDictionaryFilePath=tmpPath
            self.PlugeDict()

    def LoadJson(self):
        self.jr=JsonReader(self.attrFilePath)
        self.ShowEntity()

    def EntityResearch(self):
        try:
            self.comboBoxEntitys.currentIndexChanged.disconnect(self.ShowAttribute)
        except TypeError:
            print('the slot was not connected')

        entityList = self.jr.getEntityList()
        string = self.lineEditEntityResearch.text()
        cp = re.compile(string)
        researchList = list()
        for entity in entityList:
            if re.findall(cp,entity) != []:
                researchList.append(entity)
        self.comboBoxEntitys.clear()
        self.comboBoxEntitys.addItems(researchList)
        self.comboBoxEntitys.currentIndexChanged.connect(self.ShowAttribute)

    def ShowEntity(self):
        try:
            self.comboBoxEntitys.currentIndexChanged.disconnect(self.ShowAttribute)
        except TypeError:
            print('the slot was not connected')
        self.comboBoxEntitys.clear()
        self.comboBoxEntitys.addItems(self.jr.getEntityList())
        self.comboBoxEntitys.currentIndexChanged.connect(self.ShowAttribute)
    
    def ShowAttribute(self,i):
        entity=self.comboBoxEntitys.currentText()

        attrList,attrListLength = self.jr.getCurrentEntityAttributeList(entity)

        #print(attrListLength)
        if len(self.viewList) < attrListLength:
            listIndex = -1
            for view in self.viewList:
                slm = QStringListModel()
                listIndex+=1
                view.clear()
                view.addItems(self.jr.getAttributeItems(entity,listIndex))

            widgetColumnIndex=listIndex
            widgetRowIndex=0

            for i in range(attrListLength-len(self.viewList)):
                #print(slm)
                listIndex+=1
                widgetColumnIndex+=1
                
                if widgetColumnIndex > 4:
                    widgetRowIndex+=1
                    widgetColumnIndex=0
                print(r'widgetRowIndex:{},widgetColumnIndex:{}'.format(widgetRowIndex,widgetColumnIndex))
                tmpListView = QListWidget()
                tmpListView.addItems(self.jr.getAttributeItems(entity,listIndex))
                tmpListView.itemClicked.connect(self.UpdateAttr)
                # hold Ctrl to select multiple items
                tmpListView.setSelectionMode(QAbstractItemView.ExtendedSelection)
                self.viewList.append(tmpListView)
                self.gridLayoutAttrListView.addWidget(tmpListView,widgetRowIndex,widgetColumnIndex)
        else:
            for index in range(attrListLength):
                self.viewList[index].clear()
                self.viewList[index].addItems(self.jr.getAttributeItems(entity,index))
                self.viewList[index].show()

            for index in range(attrListLength,len(self.viewList)):
                self.viewList[index].hide() 

    def UpdateAttr(self,qModelIndex):
        result=[]
        for view in self.viewList:
            tlist = view.selectedItems()
            text = [t.text() for t in list(tlist)]
            result+=text

        if self.checkBoxPlugeDict.isChecked():
            for pdview in self.PluDictViewList:
                tlist = pdview.selectedItems()
                text = [t.text() for t in list(tlist)]
                result+=text

        self.lineEditResult.setText('{'+','.join(result)+'}')

    def LoadCorpus(self):
        self.cr=CorpusReader(self.readCorpusFilePath)
        id,corpu=self.cr.getCorpu(1)
        self.UpdataCorpu(id,corpu)

    # previous page
    def PreviousPage(self):
        id=self.cr.getId()
        id,corpu=self.cr.getCorpu(id-1)
        self.UpdataCorpu(id,corpu)
        self.lineEditResult.setText(self.cr.getCorpuResult(id))

    # next page
    def NextPage(self):
        id=self.cr.getId()
        id,corpu=self.cr.getCorpu(id+1)
        self.UpdataCorpu(id,corpu)
        self.lineEditResult.setText(self.cr.getCorpuResult(id))

    # jump to a given page
    def JumpPage(self):
        id=int(self.lineEditPage.text())
        _,corpu=self.cr.getCorpu(id)
        self.UpdataCorpu(id,corpu)
        self.lineEditResult.setText(self.cr.getCorpuResult(id))

    def UpdataCorpu(self,id,corpu):
        self.lblNo.setText(str(id))
        self.lblSourceCorpu.setText(corpu)

    def SaveCorpus(self):
        self.cr.setSavePath(self.saveCorpusFilePath)

    #提交结果
    def Commit(self):
        id=self.cr.getId()
        corpu=self.lineEditResult.text()
        self.cr.Commit(id,corpu)
        self.cr.Save()
        '''add the input to the input-history list'''
        if corpu not in self.historyInputList:
            self.historyInputList.append(corpu)
            self.historyInputListWidget.addItem(corpu)

    def PlugeDict(self):
        self.plugeDictjr=JsonReader(self.plugeDictionaryFilePath)
        self.plugeDictAttrList=self.plugeDictjr.getEntityList()
        widgetRowIndex=0
        widgetColumnIndex=-1
        for attrTitle in self.plugeDictAttrList:
            #print(self.plugeDictjr.getPlugeDictAttrItems(attrTitle))
            widgetColumnIndex+=1
                
            if widgetColumnIndex > 4:
                widgetRowIndex+=1
                widgetColumnIndex=0
            print(r'widgetRowIndex:{},widgetColumnIndex:{}'.format(widgetRowIndex,widgetColumnIndex))
            tmpListView = QListWidget()
            item=self.plugeDictjr.getPlugeDictAttrItems(attrTitle)
            self.PlugeDictionary.append(item)
            tmpListView.addItems(item)
            tmpListView.itemClicked.connect(self.UpdateAttr)
            # hold Ctrl to select multiple items
            tmpListView.setSelectionMode(QAbstractItemView.ExtendedSelection)
            self.PluDictViewList.append(tmpListView)
            tmpListView.hide()
            self.gridLayoutPlugeDictListView.addWidget(tmpListView,widgetRowIndex,widgetColumnIndex)

    def closeEvent(self, event):
        """
        重写closeEvent方法,实现dialog窗体关闭时执行一些代码
        :param event: close()触发的事件
        :return: None
        """
        configItems=list()
        
        try:
            configItems.append('historyInputList&&'+"||".join(self.historyInputList))
            configItems.append('readCorpusFilePath&&'+self.readCorpusFilePath)
            configItems.append('attrFilePath&&'+self.attrFilePath)
            configItems.append('plugeDictionaryFilePath&&'+self.plugeDictionaryFilePath)
            configItems.append('saveCorpusFilePath&&'+self.saveCorpusFilePath)           
        except AttributeError:
            print('some settings have not been set')
        self.configFile.WriteConfig(configItems)

    def ReadConfig(self):
        self.configFile = ConfigFile()
        configItems=self.configFile.ReadConfig()

        for item in configItems:
            part=item.strip('\n').split('&&')
            if len(part)!=1:
                if part[0]=='attrFilePath':
                    self.attrFilePath=part[1]
                    self.LoadJson()
                elif part[0]=='plugeDictionaryFilePath':
                    self.plugeDictionaryFilePath=part[1]
                    self.PlugeDict()
                elif part[0]=='saveCorpusFilePath':
                    self.saveCorpusFilePath=part[1]
                    self.SaveCorpus()
                elif part[0]=='readCorpusFilePath':
                    self.readCorpusFilePath=part[1]
                    self.LoadCorpus()
                elif part[0]=='historyInputList':
                    self.historyInputList=part[1].split('||')
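closeEvent and ReadConfig round-trip a simple line-based config format: each line is key&&value, and list values (historyInputList) are joined with ||. A standalone sketch of that encoding, assuming nothing about ConfigFile beyond what the two methods show:

# encode/decode the key&&value config lines used above
def encode_item(key, value):
    if isinstance(value, list):
        value = '||'.join(value)
    return key + '&&' + value

def decode_item(line):
    part = line.strip('\n').split('&&')
    return part[0], (part[1] if len(part) > 1 else None)

line = encode_item('historyInputList', ['{a,b}', '{c}'])
# 'historyInputList&&{a,b}||{c}'
key, raw = decode_item(line)
history = raw.split('||')  # ['{a,b}', '{c}']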
Example #10
File: main.py Project: qiuwei/snlp
    print total_num, error_num
    return float(total_num - error_num) / (total_num)

if __name__ == '__main__':
    brill_tagger = None
    mle_tagger = None
    try:
        fin = open('mletagger.model', 'rb')
        mle_tagger = load(fin)
        print "MLE tagger loaded"
        fin.close()
    # model doesn't exist
    except IOError:
        print "MLE model not found! Retraining..."
        print "Loading training corpus..."
        train_corpus = CorpusReader.readin('train.pos')
        print "Corpus loaded"
        print "Learning MLE tagger..."
        model = MLETagLearner.learn(train_corpus)
        mle_tagger = MLETagger(model)
        print "MLE tagger learned"
        fout = open('mletagger.model', 'wb')
        dump(mle_tagger, fout, -1)
        fout.close()
    
     
    try:
        fin = open('brilltagger.model', 'rb')
        brill_tagger = load(fin)
        print "Brill tagger loaded!"
        fin.close()
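The try/except IOError around open + load implements a simple model cache: reuse the pickled tagger when it exists, otherwise retrain and dump it. The same pattern in compact form (load_or_train and train_model are names introduced here; the snippet itself is Python 2, this sketch is Python 3):

import pickle

def load_or_train(path, train_model):
    # train_model: a zero-argument callable that builds the model
    try:
        with open(path, 'rb') as fin:
            return pickle.load(fin)       # cached model found
    except IOError:                       # no cache: train and persist
        model = train_model()
        with open(path, 'wb') as fout:
            pickle.dump(model, fout, -1)  # -1 = highest pickle protocol
        return model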
Example #11
    print('')
    print('-printing each lemma-')
    nlpPipeLine.createLemma(nlp, sentTest)

    print('')
    print('-printing each POS tag-')
    nlpPipeLine.createPOS(nlp, sentTest)

    print('')
    print('-printing the dependency parse tree-')
    nlpPipeLine.createDepParse(nlp,sentTest)


    data_folder_train = Path("data/train-set.txt")
    trainCorpusObject = CorpusReader(data_folder_train)

    data_folder_test = Path("data/dev-set.txt")
    devCorpusObject = CorpusReader(data_folder_test)


    mlObject = MachineLearningTasks(trainCorpusObject, devCorpusObject)

    # run the NLP pipeline for each paragraph in the corpus object
    # and store the results in the appropriate HashMap dict

    """a = 0
    for corpusParah in trainCorpusObject.corpus:
        #doc1 = nlp(corpusParah.hm1["sent"])
        #doc2 = nlp(corpusParah.hm2["sent"])
        if(a==2):