def produce_traintest(OrgFP, TestSpec, CheckAgainst=None):
    """Split the sentences of OrgFP into a test file and a training file.

    Two files are written next to OrgFP: '<stem>_test.mecab' and
    '<stem>_train.mecab', where <stem> is myModule.get_stem_ext(OrgFP)[0].
    Each sentence is written as its lines joined by newlines, followed by an
    'EOS' line.

    Args:
        OrgFP: path of the file whose sentences are split.
        TestSpec: a (WhereFrom, TestNum, PercentP) triple. Sentences whose
            1-based position is >= WhereFrom go to the test file until
            TestNum sentences have been counted; everything else goes to the
            training file. When PercentP is true, both numbers are
            percentages of the sentence count of OrgFP.
        CheckAgainst: optional path of a file with one sentence string per
            line. A sentence reported by already_in_anothersentlist_p as
            present in that list is written to neither output, but still
            counts towards TestNum.
    """
    (WhereFrom, TestNum, PercentP) = TestSpec
    SentCnt = count_sentences(OrgFP)
    if PercentP:
        # Multiply before dividing: the previous SentCnt//(100/x) form raised
        # ZeroDivisionError for 0% and could land one sentence short because
        # of float rounding (e.g. 10//(100/30) == 2.0, not 3).
        WhereFrom = int(SentCnt * WhereFrom // 100)
        TestNum = int(SentCnt * TestNum // 100)

    SentsAlreadyInTest = None
    if CheckAgainst:
        with open(CheckAgainst) as FSrCheck:
            SentsAlreadyInTest = FSrCheck.read().strip().split('\n')

    Stem = myModule.get_stem_ext(OrgFP)[0]
    TestCntr = 0
    with open(Stem + '_test.mecab', 'wt') as FSwTest, \
            open(Stem + '_train.mecab', 'wt') as FSwTrain:
        for Cntr, Sent in enumerate(extract_sentences(OrgFP)):
            if CheckAgainst:
                # The sentence string is the first tab-separated field of
                # every line, concatenated.
                SentStr = ''.join(Line.split('\t')[0] for Line in Sent)
                if already_in_anothersentlist_p(SentStr, SentsAlreadyInTest):
                    TestCntr += 1
                    continue
            if Cntr + 1 >= WhereFrom and TestCntr < TestNum:
                TestCntr += 1
                FSwToWrite = FSwTest
            else:
                FSwToWrite = FSwTrain
            FSwToWrite.write('\n'.join(Sent) + '\nEOS\n')
def generate_ftchunk(MecabDicFP, FtInds, Out=sys.stdout):
    """Yield runs of consecutive sorted dictionary lines sharing the same features.

    The dictionary is first sorted by sort_mecabdic_fts into
    '<stem>.sorted.csv', where <stem> comes from MecabDicFP with 'rawData'
    replaced by 'processedData'. The sorted file is then read line by line;
    each line is stripped and split on ','. Lines whose fields at the
    positions in FtInds are equal to those of the preceding line belong to
    the same chunk.

    Args:
        MecabDicFP: path of the dictionary file.
        FtInds: indices of the comma-separated fields that define a chunk.
        Out: unused here; kept for interface compatibility.

    Yields:
        Non-empty lists of stripped lines, one list per run of equal features.
    """
    SortedDicFP = myModule.get_stem_ext(
        MecabDicFP.replace('rawData', 'processedData'))[0] + '.sorted.csv'
    sort_mecabdic_fts(MecabDicFP, FtInds, OutFP=SortedDicFP)

    Lines = []
    PrvRelvFts = None
    with open(SortedDicFP) as FSr:
        # The first line of the sorted file has always been skipped.
        # NOTE(review): presumably a header row -- confirm against
        # sort_mecabdic_fts.
        next(FSr, None)
        for RawLine in FSr:
            Line = RawLine.strip()
            LineEls = Line.split(',')
            RelvFts = [LineEls[Ind] for Ind in FtInds]
            if Lines and RelvFts != PrvRelvFts:
                yield Lines
                Lines = []
            # The line that opens a new run belongs to that run; it used to
            # be dropped when the features changed.
            Lines.append(Line)
            PrvRelvFts = RelvFts
    # The last run has no following feature change to trigger its emission.
    if Lines:
        yield Lines
def markedsents2outputs(MkdSents, OrgFP, StrictP=True, MoveTo=None):
    """Write marked sentences to a reduced file and report erroneous ones.

    Each marked sentence is a sequence of marked lines; for every line,
    element [0] is a string, element [-1] is a marker string and element
    [-2] may be None. A sentence counts as an error when not all of its
    markers equal 'original' and either StrictP is true or some line has
    None at [-2]. Error sentences are listed in '<OrgFP>.errors'; all other
    sentences are rendered with markedsent2output and written to
    '<stem>.reduced.mecab'.

    If no error is found, both generated files are deleted. Otherwise the
    original file and the error file are moved to MoveTo, and the reduced
    file is left in place.

    Args:
        MkdSents: iterable of marked sentences.
        OrgFP: path of the original file the sentences came from.
        StrictP: when true, any sentence with a non-'original' marker is an
            error.
        MoveTo: destination for the original and error files when errors
            are found; defaults to the current working directory.

    Returns:
        True when no error was found, False otherwise.

    Raises:
        OSError: if copying a file to MoveTo fails. The source file is then
            left untouched.
    """
    import shutil  # local import: only needed by this function

    ErrorOutput = OrgFP + '.errors'
    ReducedOutput = myModule.get_stem_ext(OrgFP)[0] + '.reduced.mecab'
    ErrorCnt = 0
    LineCntr = 0
    with open(ErrorOutput, 'wt') as FSwE, open(ReducedOutput, 'wt') as FSwR:
        for Cntr, MkdSent in enumerate(MkdSents):
            # +1 per sentence on top of its own lines; LineCntr is reported
            # next to the sentence number in the error file.
            LineCntr += len(MkdSent) + 1
            AllOriginal = all(Line[-1] == 'original' for Line in MkdSent)
            IsError = (not AllOriginal) and (
                StrictP or any(Line[-2] is None for Line in MkdSent))
            if IsError:
                ErrorCnt += 1
                Header = str(Cntr + 1) + '; ' + str(LineCntr)
                Body = '\n'.join(
                    MkdLine[0] + '\t' + MkdLine[-1] for MkdLine in MkdSent)
                FSwE.write(Header + '\n' + Body + '\n')
            else:
                FSwR.write(markedsent2output(MkdSent))

    if ErrorCnt == 0:
        os.remove(ReducedOutput)
        os.remove(ErrorOutput)
        print('No error found for file ' + OrgFP)
        time.sleep(2)
        return True

    print(str(ErrorCnt) + ' error(s) found for file ' + OrgFP)
    if not MoveTo:
        # os.getcwd must be called; the bare function object used to be
        # passed on, which raised TypeError.
        MoveTo = os.getcwd()
    for SrcFP in (OrgFP, ErrorOutput):
        try:
            shutil.copy(SrcFP, MoveTo)
        except shutil.SameFileError:
            # Already at the destination: deleting it would lose the file.
            continue
        # Only reached after a successful copy, so a failed copy can no
        # longer be followed by deletion of the source.
        os.remove(SrcFP)
    print('Original file moved to ' + MoveTo)
    time.sleep(2)
    return False
def main0(MecabFP, CorpusOrDic='dic', OutFP=None, Debug=0, Fts=None,
          UnkAbsFtCnt=2, StrictP=False, OrgReduced=True):
    """Compress a MeCab file chunk by chunk with lemmatise_mecabchunk.

    Chunks come from generate_chunks(MecabFP, CorpusOrDic). For every
    non-empty chunk that is processed successfully, the new lines followed
    by an 'EOS' line are written to the output. A failed chunk is re-run
    with Debug=2; unless StrictP is true, a description of the failure is
    also written to stderr and collected for the error report.

    Args:
        MecabFP: path of the input file.
        CorpusOrDic: passed to generate_chunks and lemmatise_mecabchunk.
        OutFP: True writes to '<stem>.compressed.<ext>' derived from
            MecabFP; a falsy value writes to stdout; any other value is
            taken as the output path, which is written as '<OutFP>.tmp'
            and renamed to OutFP once processing has finished.
        Debug: when truthy, per-chunk progress is written to stderr; also
            passed to lemmatise_mecabchunk.
        Fts: passed to lemmatise_mecabchunk.
        UnkAbsFtCnt: passed to lemmatise_mecabchunk.
        StrictP: when true, failures are not collected in the error report.
        OrgReduced: when true and the output goes to a file, the original
            lines of every successful chunk are also written to
            '<output path>.orgreduced'.

    Side files ('.orgreduced', '.errors') are only produced when the output
    goes to a file, because their names are derived from the output path.
    """
    NewWds = set()

    # FinalFP is the path the output ends up at (None for stdout);
    # TmpFP is set only when the output is staged in a '.tmp' file.
    TmpFP = None
    if OutFP is True:
        Stem, Ext = myModule.get_stem_ext(MecabFP)
        FinalFP = Stem + '.compressed.' + Ext
        Out = open(FinalFP, 'wt')
    elif not OutFP:
        FinalFP = None
        Out = sys.stdout
    else:
        FinalFP = OutFP
        TmpFP = OutFP + '.tmp'
        Out = open(TmpFP, 'wt')

    OrgReducedFSw = None
    ErrorStrs = []
    try:
        if OrgReduced and FinalFP is not None:
            OrgReducedFSw = open(FinalFP + '.orgreduced', 'wt')
        ChunkGen = generate_chunks(MecabFP, CorpusOrDic)
        print('\nCompressing ' + MecabFP + '\n')
        for Cntr, SentChunk in enumerate(ChunkGen):
            if not SentChunk:
                if Debug:
                    sys.stderr.write(
                        '\nsent ' + str(Cntr + 1) + ' is empty\n')
                continue
            if Debug:
                sys.stderr.write(
                    '\nsent ' + str(Cntr + 1) + ' ' +
                    ''.join([Sent.split('\t')[0] for Sent in SentChunk]) +
                    '\n')
            SuccessP, NewLines = lemmatise_mecabchunk(
                SentChunk, CorpusOrDic, NewWds, OutFP,
                Debug=Debug, Fts=Fts, UnkAbsFtCnt=UnkAbsFtCnt)
            if SuccessP:
                Out.write('\n'.join(NewLines + ['EOS']) + '\n')
                if OrgReducedFSw is not None:
                    # Same layout as the main output: the chunk's lines,
                    # then an 'EOS' terminator line. (Joining the lines
                    # with 'EOS\n' glued 'EOS' to the end of each line.)
                    OrgReducedFSw.write(
                        '\n'.join(list(SentChunk) + ['EOS']) + '\n')
                continue

            if not StrictP:
                FailedNth = len(NewLines)
                if len(NewLines) == 1:
                    MiddlePhr = '(the first word failed)'
                else:
                    MiddlePhr = ('(starting with the word ' +
                                 NewLines[0].split()[0] + ')')
                ErrorStr = ('Sentence ' + str(Cntr + 1) + ' ' + MiddlePhr +
                            ' failed on its ' + str(FailedNth) +
                            'th line:\n' + NewLines[-1].get_mecabline())
                sys.stderr.write('\n' + ErrorStr + '\n')
                ErrorStrs.append(ErrorStr)
            # The failed chunk is processed again with Debug=2 in both
            # strict and non-strict mode.
            lemmatise_mecabchunk(SentChunk, CorpusOrDic, NewWds, OutFP,
                                 Debug=2, Fts=Fts)
    finally:
        if OrgReducedFSw is not None:
            OrgReducedFSw.close()
        if Out is not sys.stdout:
            Out.close()

    print('\ncompression for ' + MecabFP + ' ended')
    if FinalFP is not None:
        if TmpFP is not None:
            # Only reached when processing finished without an exception,
            # so an aborted run leaves the '.tmp' file and no final file.
            os.rename(TmpFP, FinalFP)
        print(' Output file: ' + FinalFP + '')

    if ErrorStrs:
        CountMsg = (' Error(s) found, error count ' + str(len(ErrorStrs)) +
                    ' out of ' + str(Cntr + 1) + ' sentences. ')
        if FinalFP is not None:
            ErrorFP = FinalFP + '.errors'
            print(CountMsg + 'For details see ' + ErrorFP + '\n')
            time.sleep(2)
            with open(ErrorFP, 'wt') as ErrorOut:
                ErrorOut.write('\n'.join(ErrorStrs))
        else:
            # No output path to derive an error file name from; the
            # details have already been written to stderr above.
            print(CountMsg + 'Details were written to stderr.\n')
    else:
        print(' No errors, congrats!\n')