Example #1
    def build_alt_words_table(n):

        Logger.log_message('Building alternate words table based on ' + str(n) + '-grams')
        alt_words = {}

        in_file = path.join('out', str(n) + '-gram-regexp.csv')
        Logger.log_message('Reading ' + in_file)

        input_file = open(in_file)

        for line in input_file.readlines():
            words = line.split(';')[1].split()
            for word in words:
                for alt_word in words:
                    if word in alt_words:
                        if alt_word not in alt_words[word]:
                            alt_words[word].append(alt_word)
                    else:
                        alt_words[word] = [alt_word]

        input_file.close()
        Logger.log_success('Finished reading ' + in_file)

        out_file = path.join('out', 'lwlm-alt-words-' + str(n) + '-grams.csv')
        Logger.log_message('Writing alternate words table to ' + out_file)
        output_file = open(out_file, 'w+')

        for word in alt_words:
            words = set(alt_words[word])
            col = ' '.join(words)
            output_file.write(word + ';' + col + '\n')

        output_file.close()
        Logger.log_success('Alternate words table has been written to ' + out_file)
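
A minimal, self-contained sketch of the grouping step above, run on two hypothetical rows of the n-gram-regexp CSV (each row has the form "pattern;word1 word2 ..."):

    # Hypothetical input rows; only the word column after ';' matters here
    rows = ['the|*|sat;cat dog', 'a|*|house;big small']
    alt_words = {}
    for line in rows:
        words = line.split(';')[1].split()
        for word in words:
            group = alt_words.setdefault(word, [])
            for alt_word in words:
                if alt_word not in group:
                    group.append(alt_word)
    assert alt_words['cat'] == ['cat', 'dog']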
Example #2
    def run(self):

        Logger.log_message("Running ITFIDF")

        Logger.log_message("Reading " + self.tf_dict)
        tf_dict_file = open(self.tf_dict)

        for line in tf_dict_file.readlines():
            cols = line.split(";")
            self.tf_dictionary[cols[0]] = int(cols[1])

        tf_dict_file.close()

        Logger.log_message("Reading " + self.df_dict)
        df_dict_file = open(self.df_dict)

        for line in df_dict_file.readlines():
            cols = line.split(";")
            self.df_dictionary[cols[0]] = int(cols[1])

        df_dict_file.close()

        max_tf = max(self.tf_dictionary.values())
        max_df = max(self.df_dictionary.values())

        for word in self.df_dictionary:
            if word in self.tf_dictionary:
                self.itfidf[word] = (max_tf * max_df) / \
                    (self.tf_dictionary[word] * self.df_dictionary[word])

        Logger.log_message("Writing results to " + self.out_file)
        self.dump_results()
        Logger.log_success("Finished writing results to " + self.out_file)
    def __init__(self, stats_file, out_file):
        Logger.log_message("Initializing Readability Calculator")

        self.stats_file = stats_file
        self.out_file = out_file

        self.readability_grade = {}
Example #4
    def parse(self, in_file):

        if not in_file.endswith(".txt"):
            return

        Logger.log_message("Parsing file " + in_file)

        input_file = open(in_file)
        content = input_file.read()

        words = content.split()

        self.number_of_words[in_file] = 0
        self.number_of_sentences[in_file] = 0
        self.number_of_syllables[in_file] = 0
        self.number_of_chars[in_file] = 0

        for word in words:

            # Check if there are any separators
            for separator in self.separators:
                if separator in word:
                    self.number_of_sentences[in_file] += 1

            sanitized_word = Parser.sanitize_word(word)

            if sanitized_word == "":
                continue

            self.number_of_words[in_file] += 1
            self.number_of_chars[in_file] += len(sanitized_word)
            self.number_of_syllables[in_file] += \
                SyllableCounter.count_syllables(sanitized_word)

        input_file.close()
Example #5
def main():

    if len(sys.argv) > 1:
        if sys.argv[1] == 'server':
            run_server()
            return

    Logger.log_message("Running application Simplify")
    cleanup()
    LWLM.build_tables('corpus')
    Logger.log_success("Application exited successfully")
Example #6
    def __init__(self, in_file, out_file):
        Logger.log_message("Initializing merger")

        self.in_file = in_file
        self.out_file = out_file

        self.file_names = "abcdefghijklmnopqrstuvwxyz"

        self.number_of_chars = {}
        self.number_of_words = {}
        self.number_of_sentences = {}
        self.number_of_syllables = {}
Example #7
    def sanitize(in_file, out_file):

        Logger.log_message('Sanitizing ' + in_file)

        input_file = open(in_file, 'r')
        output_file = open(out_file, 'w')

        for input_line in input_file.readlines():
            output_line = ' '.join([word.lower() for word in split(r'\W', input_line) if word])
            output_file.write(output_line + '\n')

        input_file.close()
        output_file.close()
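
A quick sketch of what one pass of the sanitizer does to a line, assuming split is re.split as imported by this module:

    from re import split

    line = "Hello, World! It's 2024."
    print(' '.join(word.lower() for word in split(r'\W', line) if word))
    # -> hello world it s 2024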
Example #8
    def stem_file(in_file, out_file):

        Logger.log_message('Stemming ' + in_file)

        input_file = open(in_file, 'r')
        output_file = open(out_file, 'w')

        # Reuse a single stemmer; .stem() is the current NLTK API
        # (older releases exposed it as stem_word()).
        stemmer = PorterStemmer()

        for line in input_file.readlines():
            output_line = ' '.join(stemmer.stem(word) for word in line.split())
            output_file.write(output_line + '\n')

        input_file.close()
        output_file.close()
Example #9
    def __init__(self, in_dir, out_dir, out_file):
        Logger.log_message("Initializing parser")

        self.in_dir = in_dir
        self.out_file = path.join(out_dir, out_file)
        self.out_dir = out_dir

        self.separators = [".", "!", "?"]

        self.number_of_words = {}
        self.number_of_sentences = {}
        self.number_of_syllables = {}
        self.number_of_chars = {}
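Example #10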
    def run(self):
        Logger.log_message("Running Readability Calculator on " +
                           self.stats_file)

        input_file = open(self.stats_file)

        for line in input_file.readlines():

            if line[0] == "#":
                continue

            cols = line.split(";")

            grade = int(cols[0])
            words = float(cols[2])
            syllables = float(cols[3])
            sentences = float(cols[4])

            flesch_kincaid_grade = 0.39 * (words / sentences) + \
                11.8 * (syllables / words) - 15.59

            self.readability_grade[grade] = flesch_kincaid_grade

        input_file.close()

        Logger.log_message("Writing results to " + self.out_file)
        self.dump_results()
        Logger.log_success("Results written to " + self.out_file)

        # pearsonr expects sequences, not dict views
        grades = list(self.readability_grade.keys())
        fk_grades = list(self.readability_grade.values())

        correlation = pearsonr(grades, fk_grades)
        Logger.log_result("Correlation between grade level and Flesch Kincaid grade: " \
            + str(correlation))
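
For reference, the Flesch-Kincaid formula used above, traced with hypothetical counts:

    words, syllables, sentences = 1200.0, 1800.0, 100.0
    fk = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
    assert round(fk, 2) == 6.79   # 0.39 * 12.0 + 11.8 * 1.5 - 15.59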
Example #11
    def parse_file(self, in_file):
        Logger.log_message('Running ' + str(self.n) + '-Gram Frequency counter on ' + in_file)

        input_file = open(in_file)

        # Create a sanitized content string
        content = ""

        for line in input_file.readlines():
            words = line.split()
            for word in words:
                content += Sanitizer.sanitize_word(word) + " "

        content = content.split()

        length = len(content)

        # Parse the content
        for i in range(length - self.n + 1):
            s = ""
            neighbor_str = ""
            word = ""

            for j in range(self.n):
                s += content[i + j]

                if j == self.n // 2:  # integer division; n / 2 is a float in Python 3
                    neighbor_str += '*'
                    word = content[i + j]
                else:
                    neighbor_str += content[i + j]

                if j != self.n - 1:
                    s += '|'
                    neighbor_str += '|'

            if s in self.table:
                self.table[s] += 1
            else:
                self.table[s] = 1

            if neighbor_str in self.neighbors:
                self.neighbors[neighbor_str].append(word)
            else:
                self.neighbors[neighbor_str] = [word]

        input_file.close()
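
One window of the loop above, traced by hand for n == 3 on hypothetical content (the middle word is masked with '*' in the neighbor pattern and recorded separately):

    n, i = 3, 0
    content = ['the', 'cat', 'sat']
    middle = n // 2
    s = '|'.join(content[i:i + n])
    neighbor = '|'.join('*' if j == middle else content[i + j] for j in range(n))
    word = content[i + middle]
    assert (s, neighbor, word) == ('the|cat|sat', 'the|*|sat', 'cat')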
Example #12
    def count_kf_frequency(self, in_file):
        Logger.log_message('Counting Kucera Francis frequency for ' + in_file)

        input_file = open(in_file, 'r')

        for line in input_file.readlines():
            for word in line.split():

                if word.isdigit():
                    continue

                if word in self.kf_val:
                    # Word is present in the psycholinguistic dictionary
                    self.kf_res[word] = self.kf_val[word]
                else:
                    self.kf_res[word] = 0

        input_file.close()
Example #13
    def count_syllables(self, in_file):
        Logger.log_message('Counting number of syllables for ' + in_file)

        input_file = open(in_file, 'r')

        for line in input_file.readlines():
            for word in line.split():

                if word.isdigit():
                    continue

                if word in self.syllable_val:
                    # Word is present in the psycholinguistic dictionary
                    self.syllable_res[word] = self.syllable_val[word]
                else:
                    self.syllable_res[word] = 0

        input_file.close()
Example #14
    def count_term_frequency(self, in_file):

        Logger.log_message('Counting term frequency for ' + in_file)

        input_file = open(in_file, 'r')

        for line in input_file.readlines():
            for word in line.split():

                if word.isdigit():
                    continue

                if word in self.frequencies:  # dict.has_key() was removed in Python 3
                    self.frequencies[word] += 1
                else:
                    self.frequencies[word] = 1

        input_file.close()
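
An equivalent, more idiomatic way to accumulate the same counts is collections.Counter; a sketch with a hypothetical file name:

    from collections import Counter

    frequencies = Counter()
    with open('sample.txt') as input_file:
        for line in input_file:
            frequencies.update(w for w in line.split() if not w.isdigit())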
Example #15
    def build_tables(in_dir):
        Logger.log_message('Building 3-Gram LWLM tables')
        ng = NGram(3, in_dir)
        ng.run()
        Logger.log_success('Finished building 3-Gram LWLM tables')

        Logger.log_message('Building 5-Gram LWLM tables')
        ng = NGram(5, in_dir)
        ng.run()
        Logger.log_success('Finished building 5-Gram LWLM tables')

        LWLM.build_alt_words_table(3)
        LWLM.build_alt_words_table(5)
Example #16
def main():
    Logger.log_message("Starting NCERT Readability application")

    # Run the parser
    parser = Parser(CORPUS_DIR, OUTPUT_DIR, STATS_FILE)
    parser.run()

    # Merge the stats
    merger = Merger(path.join(OUTPUT_DIR, STATS_FILE),
                    path.join(OUTPUT_DIR, MERGED_STATS_FILE))

    merger.run()

    readability_calc = ReadabilityCalculator(
        path.join(OUTPUT_DIR, MERGED_STATS_FILE),
        path.join(OUTPUT_DIR, RESULTS_FILE))

    readability_calc.run()

    Logger.log_success("Application exited successfully")
Example #17
    def run(self):
        # Check for the input directory
        try:
            stat(self.in_dir)
        except OSError:
            Logger.log_error('Input text not found')
            return

        # Create the output directory
        try:
            stat(self.out_dir)
        except OSError:
            makedirs(self.out_dir)

        Logger.log_message('Started stemming')

        # Walk through the input directory
        for (dir_path, _, file_names) in walk(self.in_dir):
            for file_name in file_names:

                in_file = path.join(dir_path, file_name)
                out_file = path.join(self.out_dir, file_name + '_' + dir_path.replace('/', '_') + '.txt')

                Stemmer.stem_file(in_file, out_file)

        Logger.log_success('Finished stemming')
Example #18
    def run(self):
        Logger.log_message("Running merger on " + self.in_file)

        input_file = open(self.in_file)

        for line in input_file.readlines():

            if line[0] == "#":
                continue

            cols = line.split(";")
            file_name = str(cols[0].split("/")[-1])

            grade = self.file_names.index(file_name[0]) + 1

            if grade not in self.number_of_words:
                self.number_of_chars[grade] = 0
                self.number_of_words[grade] = 0
                self.number_of_syllables[grade] = 0
                self.number_of_sentences[grade] = 0

            self.number_of_chars[grade] += int(cols[1])
            self.number_of_words[grade] += int(cols[2])
            self.number_of_syllables[grade] += int(cols[3])
            self.number_of_sentences[grade] += int(cols[4])

        input_file.close()

        Logger.log_message("Writing merged results to " + self.out_file)
        self.dump_results()
        Logger.log_success("Results written to " + self.out_file)
Example #19
    def run(self):

        # Check for the input directory
        try:
            stat(self.in_dir)
        except OSError:
            Logger.log_error('Input text not found')
            return

        Logger.log_message('Running term frequency counter')

        for (dir_path, _, file_names) in walk(self.in_dir):
            for file_name in file_names:

                in_file = path.join(dir_path, file_name)
                self.count_term_frequency(in_file)

        Logger.log_success('Finished term frequency counting')

        Logger.log_message('Writing results to ' + self.out_file)
        self.dump_results()
        Logger.log_success('Finished writing results to ' + self.out_file)
Example #20
    def terms_in_document(self, in_file):

        Logger.log_message('Running document frequency counter for ' + in_file)

        input_file = open(in_file)

        # Reset words in document every time
        for word in self.document_words:
            self.document_words[word] = False

        # Set the words present in document to True
        for line in input_file.readlines():
            for word in line.split():
                self.document_words[word] = True

        for word in self.document_words:
            # Count the word if it appeared in this document
            if self.document_words[word]:
                if word in self.document_frequencies:
                    self.document_frequencies[word] += 1
                else:
                    self.document_frequencies[word] = 1

        input_file.close()
Example #21
    def __init__(self, host, port, debug):

        self.host = host
        self.port = port
        self.debug = debug

        self.app = Flask(__name__)

        # Index route
        @self.app.route('/')
        def index():
            return render_template('index.html')

        @self.app.route('/api/tag')
        def tag_api():
            text = request.args["text"]
            _type = request.args["type"]

            tagger = Tagger(_type)
            result = tagger.tag(text)

            return jsonify(success=True, result=result)

        Logger.log_success('Server started successfully')
Example #22
def run_server():

    try:
        stat('out')
    except OSError:
        Logger.log_error('Data tables not built yet')
        Logger.log_message('Please run ./run first')
        return

    Logger.log_message('Running application server')
    web_app = WebApp('localhost', 8000, True)
    web_app.run()
Example #23
    def dump_results(self):

        out_file = path.join('out', str(self.n) + '-gram.csv')
        Logger.log_message('Writing ' + str(self.n) + '-Gram table to ' + out_file)
        output_file = open(out_file, 'w+')

        for s in self.table:
            output_file.write(s + ';' + str(self.table[s]) + '\n')

        output_file.close()
        Logger.log_success('Finished writing ' + str(self.n) + '-Gram table to ' + out_file)

        out_file = path.join('out', str(self.n) + '-gram-regexp.csv')
        Logger.log_message('Writing ' + str(self.n) + '-Gram Regular Expressions to ' + out_file)
        output_file = open(out_file, 'w+')

        for nb in self.neighbors:
            words = set(self.neighbors[nb])
            col = ' '.join(words)
            output_file.write(nb + ';' + col + '\n')

        output_file.close()
        Logger.log_success(str(self.n) + '-Gram Regular Expressions have been written to ' + out_file)
Example #24
    def run(self):
        Logger.log_message('Running ' + str(self.n) + '-Gram Frequency counter')

        try:
            stat(self.in_dir)
        except OSError:
            Logger.log_error('Input text not found')
            return

        for (dir_path, _, file_names) in walk(self.in_dir):
            for file_name in file_names:
                in_file = path.join(dir_path, file_name)
                self.parse_file(in_file)

        self.dump_results()

        Logger.log_success(str(self.n) + '-Gram Frequency counter exited successfully')
Example #25
    def run(self):
        Logger.log_message("Parser started running")

        # Check if the input directory exists
        try:
            stat(self.in_dir)
        except OSError:
            Logger.log_error("Input text not found")
            return

        # Create output directory if it doesn't exist
        try:
            stat(self.out_dir)
        except OSError:
            mkdir(self.out_dir)

        for (dir_path, _, file_names) in walk(self.in_dir):
            for file_name in file_names:
                in_file = path.join(dir_path, file_name)
                self.parse(in_file)

        Logger.log_message("Writing results to " + self.out_file)
        self.dump_results()
        Logger.log_success("Results have been written to " + self.out_file)
Example #26
    def run(self):

        # Build up the syllable count dictionary
        dictionary = open(self.dict_file, 'r')

        Logger.log_message("Reading " + self.dict_file)

        for line in dictionary.readlines():
            cols = line.split(';')
            self.syllable_val[cols[0]] = int(cols[2])

        dictionary.close()

        # Check for the input directory
        try:
            stat(self.in_dir)
        except OSError:
            Logger.log_error('Input text not found')
            return

        Logger.log_message('Running syllable counter')

        for (dir_path, _, file_names) in walk(self.in_dir):
            for file_name in file_names:
                in_file = path.join(dir_path, file_name)
                self.count_syllables(in_file)

        Logger.log_success('Finished syllable counting')

        Logger.log_message('Writing results to ' + self.out_file)
        self.dump_results()
        Logger.log_success('Finished writing results to ' + self.out_file)
Example #27
def main():

    args = sys.argv

    usage = '''
            ./run txtdump\t<Gives the text dump of corpus>
            ./run sanitize\t<Sanitize the text dump to remove white spaces, etc.>
            ./run stem\t\t<Stem the sanitized text>
            ./run tf\t\t<Calculate the raw term frequency>
            ./run df\t\t<Calculate the document frequency>
            ./run itfidf\t<Calculate the inverse term frequency - inverse document frequency>
            ./run dict\t\t<Create the psycholinguistic dictionary>
            ./run kff\t\t<Calculate the Kucera Francis frequency>
            ./run syl\t\t<Calculate the number of syllables>
            ./run server\t<Run the application server>
            '''

    if len(args) < 2:

        Logger.log_usage(usage)
        return

    if args[1] == 'server':
        web_app = WebApp('127.0.0.1', 5000, DEBUG)
        web_app.run()
        return

    elif args[1] == 'txtdump':
        txt_dump = TxtDump('corpus', path.join('tmp', 'txtdump'))
        txt_dump.run()
        return

    elif args[1] == 'sanitize':
        sanitizer = Sanitizer(path.join('tmp', 'txtdump'), path.join('tmp', 'sanitized'))
        sanitizer.run()
        return

    elif args[1] == 'stem':
        stemmer = Stemmer(path.join('tmp', 'sanitized'), path.join('tmp', 'stemmed'))
        stemmer.run()
        return

    elif args[1] == 'tf':
        tf = TermFrequency(path.join('tmp', 'stemmed'), path.join('data', 'tf_stemmed.csv'))
        tf.run()

        tf = TermFrequency(path.join('tmp', 'sanitized'), path.join('data', 'terms_list.csv'))
        tf.run()

        return

    elif args[1] == 'df':
        df = DocumentFrequency(path.join('tmp', 'stemmed'), path.join('data', 'df_stemmed.csv'),
                               path.join('data', 'tf_stemmed.csv'))
        df.run()
        return

    elif args[1] == 'itfidf':
        itfidf = ITFIDF(path.join('data', 'itfidf_stemmed.csv'),
                        path.join('data', 'tf_stemmed.csv'),
                        path.join('data', 'df_stemmed.csv'))
        itfidf.run()
        return

    elif args[1] == 'dict':
        dict_creator = PsycholinguisticDbCreator(path.join('data', 'psycholinguistic_db'),
                                                 path.join('data', 'psycholinguistic_db.csv'))
        dict_creator.create()
        return

    elif args[1] == 'kff':
        kf_freq_counter = KFFrequency(path.join('tmp', 'stemmed'),
                                      path.join('data', 'kff_stemmed.csv'),
                                      path.join('data', 'psycholinguistic_db.csv'))
        kf_freq_counter.run()
        return

    elif args[1] == 'syl':
        syllable_counter = SyllableCounter(path.join('tmp', 'stemmed'),
                                           path.join('data', 'syllables_stemmed.csv'),
                                           path.join('data', 'psycholinguistic_db.csv'))
        syllable_counter.run()
        return

    else:
        Logger.log_usage(usage)
        return
Example #28
def cleanup():
    Logger.log_message('Cleaning up')
    call(['rm', '-rf', 'out'])
    call(['mkdir', 'out'])
    Logger.log_success('Finished cleaning up')
Example #29
    def run(self):

        dictionary = open(self.dict_file)

        Logger.log_message('Reading ' + self.dict_file)

        # Construct the dictionary
        for line in dictionary.readlines():
            items = line.split(";")
            self.document_frequencies[items[0]] = 0
            self.document_words[items[0]] = False

        dictionary.close()

        # Check for the input directory
        try:
            stat(self.in_dir)
        except OSError:
            Logger.log_error('Input text not found')
            return

        Logger.log_message('Running document frequency counter')

        for (dir_path, _, file_names) in walk(self.in_dir):
            for file_name in file_names:
                in_file = path.join(dir_path, file_name)

                self.terms_in_document(in_file)

        Logger.log_message('Finished document frequency counting')

        Logger.log_message('Writing results to ' + self.out_file)
        self.dump_results()
        Logger.log_success('Finished writing results to ' + self.out_file)
Example #30
    def __init__(self, host, port, debug):
        self.host = host
        self.port = port
        self.debug = debug

        self.app = Flask(__name__)

        self.syntactic_simplifier = SyntacticSimplifier()
        self.enricher = Enricher()

        @self.app.route('/')
        def index():
            return render_template('index.html')

        @self.app.route('/enrich')
        def enrich():
            return render_template('enrich.html')

        @self.app.route('/lexus')
        def lexus():
            return render_template('lexus.html')

        @self.app.route('/syntax')
        def syntax():
            return render_template('syntax.html')

        @self.app.route('/readability')
        def readability():
            return render_template('readability.html')

        @self.app.route('/api/simplify')
        def simplify_api():
            text = request.args['text']
            n = request.args['n']

            lex_result = LexicalSimplifier.simplify(text, n)
            syn_result = self.syntactic_simplifier.simplify(text)

            result = {
                "lexical": lex_result,
                "syntactic": syn_result
            }

            return jsonify(success=True, result=result)

        @self.app.route('/api/lexus/simplify')
        def lexus_simplify_api():
            text = request.args['text']
            n = request.args['n']

            result = LexicalSimplifier.simplify(text, n)
            return jsonify(success=True, result=result)

        @self.app.route('/api/syntax/simplify')
        def syntax_simplify_api():
            text = request.args['text']

            result = self.syntactic_simplifier.simplify(text, False, True)
            return jsonify(success=True, result=result)

        @self.app.route('/api/enrich')
        def enrich_api():
            text = request.args['text']
            result = self.enricher.enrich(text)
            return jsonify(success=True, result=result)

        @self.app.route('/api/readability')
        def readability_api():
            text = request.args['text']
            result = {
                "flesch_kincaid_grade_level": FleschKincaid.calculate_grade_level(text)
            }
            return jsonify(success=True, result=result)

        Logger.log_success("Started application server successfully")