예제 #1
0
def main():
    """Run the bug-report summarization pipeline and print its scores.

    Reads the bug reports from ``./origin_data/bugreports.xml`` and the
    labels from ``./origin_data/goldset.txt``, hands the extracted samples
    to ``utils.savefile``, runs ``textrank.bugsum`` over ``./bug_reports``
    and prints the four values returned by ``eval.evaluate``.
    """
    filename = './origin_data/bugreports.xml'
    path = './bug_reports'

    bugslist = utils.read_xml(filename)
    label = utils.read_label('./origin_data/goldset.txt')
    samples, ids = utils.get_content(bugslist)
    num_word_list, numword = utils.count_word(samples)

    utils.savefile(samples)
    results = textrank.bugsum(path, numword, num_word_list)
    # Build the list of lengths before printing: passing a bare generator
    # expression to print() only shows "<generator object ...>".
    # NOTE(review): assumes ``results`` is a re-iterable sequence (it is
    # reused below) - confirm against textrank.bugsum.
    print([len(i) for i in results])

    pred = eval.index2pred(results, ids)
    y = eval.label2y(label, ids)
    mean_acc, mean_pr, mean_re, mean_f1 = eval.evaluate(y, pred)
    print('mean_acc, mean_pr, mean_re, mean_f1', mean_acc, mean_pr, mean_re,
          mean_f1)
예제 #2
0
 def on_modified(self, event):
     """Report the keyword count of a file that has just been modified.

     The file at ``event.src_path`` is read with ``file_reader``, the
     occurrences of ``KEYWORD_TO_COUNT`` are counted with ``count_word``,
     and the result is both written through a fresh ``Logger`` and printed.
     """
     count = count_word(file_reader(event.src_path), KEYWORD_TO_COUNT)
     log_line = f'In file: {event.src_path} Keyword count of {KEYWORD_TO_COUNT} is {count}'
     Logger().write_logs(log_line)
     print('%s File modified  CDS count: %s' % (event.src_path, count))
예제 #3
0
def clean_data():
    """Compute per-text features for the dataset and write the CSV outputs.

    Reads the CSV at the module-level ``data_path``. When the module-level
    ``mode`` equals ``'clean'`` (case-insensitive) the function:

    * adds feature columns computed by the ``utils`` helpers from
      ``data.text`` and writes the frame to ``filename + '.csv'``;
    * writes the result of ``utils.term_freq`` to ``term_frequency.csv``;
    * writes the processed text together with its label to ``cleaned.csv``;
    * prints the elapsed time.

    For any other mode nothing is written.

    Returns:
        None.

    Raises:
        ValueError: if the processed text does not have one entry per input
            row, so the labels cannot be attached by position.
    """
    start_time = time.time()

    processer = dataProcessor.DataProcessor()

    data = pd.read_csv(data_path, header=0)

    if mode.lower() == 'clean':
        data['word_count'] = utils.count_word(data.text)
        data['count_number'] = utils.count_numbers(data.text)
        data['emojies'] = utils.view_emojie(data.text)
        data['emoticons'] = utils.view_emoticon(data.text)
        data['len_tweet'] = utils.len_tweet(data.text)
        data['avg_words_len'] = utils.avg_word_len(data.text)
        data['count_stopwords'] = utils.count_stopwords(data.text)
        data['count_tagging'] = utils.count_tagging(data.text)
        data['flagged'] = utils.repeated_char(data.text)

        data.to_csv(filename + '.csv', index=False)

        tf = utils.term_freq(data.text)
        tf.to_csv('term_frequency.csv', index=False)

        data_pro, _ = processer.proccess_data(data.text,
                                              handle_emojies=handle_emojies)

        data_pro = pd.DataFrame(data_pro, columns=['text'])
        # DataFrame.append returned a new frame (and no longer exists in
        # pandas >= 2.0), so the former bare ``data_pro.append(...)`` call
        # never put the labels into cleaned.csv. Attach them as a column by
        # position instead; a row-count mismatch raises instead of silently
        # misaligning text and label.
        data_pro['label'] = data['label'].values
        data_pro.to_csv('cleaned.csv', index=False)

        elapsed_time = time.time() - start_time
        print(f'Finished in {elapsed_time}')

    return None
예제 #4
0
    def text_summary(self):
        """Composes a textual summary of the event.

        The summary gives the start timestamp, the commit count reported by
        ``utils.count_word('<li>', ...)`` for the event's ``changes.html``
        (omitted when the file is missing or the count is zero) and the
        number of package files.

        Returns the summary as a string.
        """
        msg = "The build event was started on %s." % self.text_timestamp()

        msg += ' It'

        pkgCount = len(self.list_package_files())

        changesName = self.file_path('changes.html')
        commitCount = 0
        if os.path.exists(changesName):
            # open() inside a with-block closes the handle right away; the
            # former file(...) call leaked it and is not available on
            # Python 3.
            with open(changesName) as changesFile:
                commitCount = utils.count_word('<li>', changesFile.read())
        if commitCount:
            # NOTE(review): a count of exactly 100 is worded as "more than";
            # presumably the commit list is capped at 100 entries - confirm.
            moreThan = 'more than ' if commitCount == 100 else ''
            msg += " contains %s%i commits and" % (moreThan, commitCount)

        msg += " produced %i installable binary package%s." % \
            (pkgCount, 's' if (pkgCount != 1) else '')

        return msg
예제 #5
0
    def text_summary(self):
        """Composes a textual summary of the event.

        The summary gives the start timestamp, the commit count reported by
        ``utils.count_word('<li>', ...)`` for the event's ``changes.html``
        (omitted when the file is missing or the count is zero) and the
        number of package files.

        Returns the summary as a string.
        """
        msg = "The build event was started on %s." % self.text_timestamp()

        msg += ' It'

        pkgCount = len(self.list_package_files())

        changesName = self.file_path('changes.html')
        commitCount = 0
        if os.path.exists(changesName):
            # Read through a context manager so the handle is closed
            # deterministically; the old file(...) builtin left it open and
            # no longer exists on Python 3.
            with open(changesName) as changesFile:
                commitCount = utils.count_word('<li>', changesFile.read())
        if commitCount:
            # NOTE(review): a count of exactly 100 is worded as "more than";
            # presumably the commit list is capped at 100 entries - confirm.
            moreThan = 'more than ' if commitCount == 100 else ''
            msg += " contains %s%i commits and" % (moreThan, commitCount)

        msg += " produced %i package%s." % \
            (pkgCount, 's' if (pkgCount != 1) else '')

        return msg
예제 #6
0
    def html_description(self, encoded=True):
        """Composes an HTML build report.

        The report consists of the text summary, a table with one row per
        binary package (or a single "n/a" row for an OS without binaries)
        and, when ``changes.html`` exists and ``utils.count_word`` finds
        ``<li>`` in it, the contents of that file under a "Commits" heading.

        encoded -- when true, the markup is wrapped in a CDATA section.

        Returns the report as a string.
        """
        msg = '<p>' + self.text_summary() + '</p>'

        # What do we have here?
        files = self.list_package_files()

        # Print out the matrix.
        msg += '<p><table cellspacing="4" border="0">'
        msg += '<tr style="text-align:left;"><th>OS<th>Binary<th>Logs<th>Issues</tr>'

        for osName, osExt, osIdent in self.oses:
            # Find the binaries for this OS.
            binaries = [f for f in files
                        if self.os_from_filename(f)[2] == osIdent]

            if not binaries:
                # Nothing available for this OS.
                msg += '<tr><td>' + osName + '<td>n/a'

                # Do we have a log?
                logName = log_filename(self.packages[0], osIdent)
                if os.path.exists(self.file_path(logName)):
                    msg += self.html_table_log_issues(logName)

                msg += '</tr>'
                continue

            # List all the binaries. One row per binary.
            isFirst = True
            for binary in self.sort_by_package(binaries):
                msg += '<tr><td>'
                if isFirst:
                    # The OS name is printed only on the first row of its
                    # group; later rows leave the cell empty.
                    msg += osName
                    isFirst = False
                msg += '<td>'
                msg += '<a href="%s">%s</a>' % (self.download_uri(binary), binary)

                # Link to the compressed log, when there is one.
                logName = self.compressed_log_filename(binary)
                if os.path.exists(self.file_path(logName)):
                    msg += self.html_table_log_issues(logName)

                # Close every row exactly once. Previously rows with a log
                # were only closed after the loop, and a last row without a
                # log received two closing tags.
                msg += '</tr>'

        msg += '</table></p>'

        # Changes.
        chgFn = self.file_path('changes.html')
        if os.path.exists(chgFn):
            # Read the file once and close it; the former code opened it
            # twice through file(...) and never closed either handle.
            with open(chgFn, 'rt') as chgFile:
                changes = chgFile.read()
            if utils.count_word('<li>', changes):
                msg += '<h2>Commits</h2>' + changes

        # Enclose it in a CDATA block if needed.
        if encoded:
            return '<![CDATA[' + msg + ']]>'
        return msg
예제 #7
0
    def html_description(self, encoded=True):
        """Composes an HTML build report.

        The report consists of the text summary, a table with one row per
        binary package (or a single "n/a" row for an OS without binaries)
        and, when ``changes.html`` exists and ``utils.count_word`` finds
        ``<li>`` in it, the contents of that file under a "Commits" heading.

        encoded -- when true, the markup is wrapped in a CDATA section.

        Returns the report as a string.
        """
        msg = '<p>' + self.text_summary() + '</p>'

        # What do we have here?
        files = self.list_package_files()

        # Print out the matrix.
        msg += '<p><table cellspacing="4" border="0">'
        msg += '<tr style="text-align:left;"><th>OS<th>Binary<th>Logs<th>Issues</tr>'

        for osName, osExt, osIdent in self.oses:
            # Find the binaries for this OS.
            binaries = [f for f in files
                        if self.os_from_filename(f)[2] == osIdent]

            if not binaries:
                # Nothing available for this OS.
                msg += '<tr><td>' + osName + '<td>n/a'

                # Do we have a log?
                logName = log_filename(self.packages[0], osIdent)
                if os.path.exists(self.file_path(logName)):
                    msg += self.html_table_log_issues(logName)

                msg += '</tr>'
                continue

            # List all the binaries. One row per binary.
            isFirst = True
            for binary in self.sort_by_package(binaries):
                msg += '<tr><td>'
                if isFirst:
                    # Only the first row of an OS group carries the OS name.
                    msg += osName
                    isFirst = False
                msg += '<td>'
                msg += '<a href="%s">%s</a>' % (self.download_uri(binary),
                                                binary)

                # Link to the compressed log, when there is one.
                logName = self.compressed_log_filename(binary)
                if os.path.exists(self.file_path(logName)):
                    msg += self.html_table_log_issues(logName)

                # Each row gets exactly one closing tag. Before, a row with
                # a log stayed open until after the loop, and a final row
                # without a log was closed twice.
                msg += '</tr>'

        msg += '</table></p>'

        # Changes.
        chgFn = self.file_path('changes.html')
        if os.path.exists(chgFn):
            # A single read through a context manager replaces two
            # file(...) calls whose handles were never closed.
            with open(chgFn, 'rt') as chgFile:
                changes = chgFile.read()
            if utils.count_word('<li>', changes):
                msg += '<h2>Commits</h2>' + changes

        # Enclose it in a CDATA block if needed.
        if encoded:
            return '<![CDATA[' + msg + ']]>'
        return msg
# Fetch a single document up front.
# NOTE(review): presumably a pymongo cursor, whose next() raises
# StopIteration on an empty collection - confirm.
test = collection.find().next()

# Word counts keyed by article id: one mapping for the body paragraphs and
# one for the titles.
articles_wordcount = {}
titles_wordcount = {}
i = 0

recent_articles = collection.find({
    'type': 'article',
    'creationDate': {
        '$gte': datetime(2019, 1, 1)
    }
})
for i, article in enumerate(recent_articles, start=1):
    # Progress indicator: running number of the article being processed.
    print(i)

    title_wordcount = sum(
        count_word(title) for title in article['title'].values())

    # Only chapters of type 'paragraph' contribute to the body word count.
    wordcount = 0
    for chapter in article['chapters']:
        if chapter['type'] == 'paragraph':
            for t in chapter['text'].values():
                wordcount += count_word(t)

    articles_wordcount[article['id']] = wordcount
    # The title count used to be computed and then thrown away.
    titles_wordcount[article['id']] = title_wordcount

# The with-statement closes each file; no explicit close() is needed.
with open("Articles_wordcount.json", 'w') as f:
    json.dump(articles_wordcount, f)
# Write the title counts here; previously this file received a second copy
# of the body word counts.
with open("title_wordcount.json", 'w') as f:
    json.dump(titles_wordcount, f)