Example #1
0
def log_short_repetitions(maxtokens, minrepetitions):
    """Write short, frequently repeated clone groups to a CSV report.

    Emits Output/<subdir>/shortterms.csv with one row per clone group
    that has at most `maxtokens` tokens, occurs at least
    `minrepetitions` times, and contains actual words.

    Relies on module-level `os`, `csv`, `clones`, and `subdir`.

    :param maxtokens: upper bound (inclusive) on group token count
    :param minrepetitions: lower bound (inclusive) on occurrence count
    """
    import semanticfilter

    def numf(num):
        # Integers are printed exactly; floats with fixed 20-digit
        # precision. A decimal comma (if any) is normalized to a dot so
        # the semicolon-delimited CSV stays unambiguous.
        fmt = "%d" if isinstance(num, int) else "%0.20f"
        return (fmt % num).replace(',', '.')

    with open(os.path.join("Output", subdir, "shortterms.csv"),
              'w',
              encoding='utf-8') as csvfile:
        fwtr = csv.writer(csvfile,
                          delimiter=';',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL,
                          lineterminator='\n')

        fwtr.writerow(['Total groups:', numf(len(clones.clonegroups))])
        fwtr.writerow([
            'N tokens', 'Occurs times', 'Common Phrase', 'Words', 'Plain Text',
            'Text'
        ])

        for cg in clones.clonegroups:
            if (cg.ntokens <= maxtokens
                    and len(cg.instances) >= minrepetitions
                    and not cg.containsNoWords()):
                words = ' '.join(
                    semanticfilter.cleanwords(cg.plain_text(), True))
                # Flatten newlines so each group stays on a single CSV row.
                fwtr.writerow([
                    numf(cg.ntokens),
                    numf(len(cg.instances)),
                    numf(int(cg.containsNoSemantic())), words,
                    cg.plain_text().replace('\r', '').replace('\n', ' '),
                    cg.text().strip().replace('\r', '').replace('\n', ' ')
                ])
Example #2
0
 def containsNoWords(self):
     """Return True if this group's plain text yields no cleaned words.

     A group counts as word-free when it has no text at all, or when
     `semanticfilter.cleanwords` produces an empty list for it.
     """
     if self.containsNoText():
         return True
     return len(semanticfilter.cleanwords(self.plain_text())) == 0
Example #3
0
def log_short_repetitions(maxtokens, minrepetitions):
    """Write short, frequently repeated clone groups to a CSV report.

    Emits Output/<subdir>/shortterms.csv with one row per clone group
    that has at most `maxtokens` tokens, occurs at least
    `minrepetitions` times, and contains actual words.

    Relies on module-level `os`, `csv`, `clones`, and `subdir`.

    :param maxtokens: upper bound (inclusive) on group token count
    :param minrepetitions: lower bound (inclusive) on occurrence count
    """
    import semanticfilter

    def numf(num):
        # Integers print exactly; floats with fixed 20-digit precision.
        # A decimal comma (if any) is normalized to a dot so the
        # semicolon-delimited CSV stays unambiguous.
        fmt = "%d" if isinstance(num, int) else "%0.20f"
        return (fmt % num).replace(',', '.')

    # encoding='utf-8' added: without it the output encoding is
    # platform-dependent, and group text may not be ASCII-safe.
    with open(os.path.join("Output", subdir, "shortterms.csv"), 'w', encoding='utf-8') as csvfile:
        fwtr = csv.writer(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

        fwtr.writerow(['Total groups:', numf(len(clones.clonegroups))])
        fwtr.writerow(['N tokens', 'Occurs times', 'Common Phrase', 'Words', 'Plain Text', 'Text'])

        for cg in clones.clonegroups:
            if cg.ntokens <= maxtokens and len(cg.instances) >= minrepetitions and not cg.containsNoWords():
                words = ' '.join(semanticfilter.cleanwords(cg.plain_text(), True))
                # Flatten newlines so each group stays on a single CSV row.
                fwtr.writerow([
                    numf(cg.ntokens),
                    numf(len(cg.instances)),
                    numf(int(cg.containsNoSemantic())),
                    words,
                    cg.plain_text().replace('\r', '').replace('\n', ' '),
                    cg.text().strip().replace('\r', '').replace('\n', ' ')
                ])