def log_short_repetitions(maxtokens, minrepetitions):
    """Dump short, frequently repeated clone groups to a CSV report.

    The report is written as UTF-8 to ``Output/<subdir>/shortterms.csv``
    (``subdir`` and ``clones`` are module-level names), using ``;`` as the
    field delimiter and ``\\n`` as the line terminator.

    A clone group gets a row only when it has at most ``maxtokens`` tokens,
    at least ``minrepetitions`` instances, and ``containsNoWords()`` is false.

    Args:
        maxtokens: Upper bound (inclusive) on ``cg.ntokens``.
        minrepetitions: Lower bound (inclusive) on ``len(cg.instances)``.
    """
    import semanticfilter

    def _fmt(value):
        # Exact ints are printed as integers, anything else with 20 decimals;
        # any comma in the rendered number is replaced by a dot.
        template = "%d" if type(value) is int else "%0.20f"
        return (template % value).replace(',', '.')

    def _single_line(text):
        # Keep each record on one CSV line: drop CRs, turn LFs into spaces.
        return text.replace('\r', '').replace('\n', ' ')

    report_path = os.path.join("Output", subdir, "shortterms.csv")
    with open(report_path, 'w', encoding='utf-8') as report:
        out = csv.writer(
            report,
            delimiter=';',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
            lineterminator='\n',
        )
        out.writerow(['Total groups:', _fmt(len(clones.clonegroups))])
        out.writerow([
            'N tokens',
            'Occurs times',
            'Common Phrase',
            'Words',
            'Plain Text',
            'Text',
        ])

        for group in clones.clonegroups:
            wanted = (
                group.ntokens <= maxtokens
                and len(group.instances) >= minrepetitions
                and not group.containsNoWords()
            )
            if not wanted:
                continue

            cleaned = semanticfilter.cleanwords(group.plain_text(), True)
            words = ' '.join(cleaned)

            token_count = _fmt(group.ntokens)
            occurrences = _fmt(len(group.instances))
            # Written under the 'Common Phrase' header as 0/1.
            no_semantic = _fmt(int(group.containsNoSemantic()))
            plain = _single_line(group.plain_text())
            raw = _single_line(group.text().strip())

            out.writerow([token_count, occurrences, no_semantic, words, plain, raw])
def containsNoWords(self):
    """Tell whether this object has no words left after cleaning.

    Returns:
        True when ``containsNoText()`` is truthy, or when
        ``semanticfilter.cleanwords`` returns an empty result for
        ``plain_text()``; False otherwise.
    """
    # No text at all means there is nothing to clean, so skip the filter.
    if self.containsNoText():
        return True
    remaining = semanticfilter.cleanwords(self.plain_text())
    return len(remaining) == 0
def log_short_repetitions(maxtokens, minrepetitions):
    """Write short, often-repeated clone groups to ``shortterms.csv``.

    The file is created at ``Output/<subdir>/shortterms.csv`` (``subdir`` and
    ``clones`` are module-level names). It starts with a ``Total groups:``
    row, then a header row, then one row per clone group that passes the
    filter below. Fields are separated by ``;`` and rows end with ``\\n``.

    The file is opened explicitly as UTF-8 so that clone text containing
    non-ASCII characters is written the same way on every platform instead
    of depending on (and possibly failing under) the locale's default
    encoding.

    Args:
        maxtokens: A group is reported only if ``cg.ntokens <= maxtokens``.
        minrepetitions: A group is reported only if it has at least this
            many instances.
    """
    import semanticfilter

    def numf(num):
        # Exact ints print as integers; everything else with 20 decimals.
        # Any comma in the rendered number is normalised to a dot.
        return (("%d" if type(num) is int else "%0.20f") % num).replace(',', '.')

    def flatten(text):
        # One record per line: strip carriage returns, fold newlines to spaces.
        return text.replace('\r', '').replace('\n', ' ')

    path = os.path.join("Output", subdir, "shortterms.csv")
    with open(path, 'w', encoding='utf-8') as csvfile:
        fwtr = csv.writer(csvfile, delimiter=';', quotechar='"',
                          quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        fwtr.writerow(['Total groups:', numf(len(clones.clonegroups))])
        fwtr.writerow(['N tokens', 'Occurs times', 'Common Phrase',
                       'Words', 'Plain Text', 'Text'])

        for cg in clones.clonegroups:
            if (cg.ntokens <= maxtokens
                    and len(cg.instances) >= minrepetitions
                    and not cg.containsNoWords()):
                # Fetch the plain text once; it feeds both the cleaned
                # word list and the 'Plain Text' column.
                plain = cg.plain_text()
                words = ' '.join(semanticfilter.cleanwords(plain, True))
                fwtr.writerow([
                    numf(cg.ntokens),
                    numf(len(cg.instances)),
                    # Stored under the 'Common Phrase' header as 0/1.
                    numf(int(cg.containsNoSemantic())),
                    words,
                    flatten(plain),
                    flatten(cg.text().strip()),
                ])