Example #1
0
File: twic_text.py  Project: jarmoza/twic2
    def GatherTexts(user_source_dir, corpus_source_dir, remove_old_plaintext = False):
        """Copy the user's *.txt files into TWiC's corpus source directory as plain-text chunks.

        user_source_dir      -- directory holding the user's .txt source files
        corpus_source_dir    -- destination directory for TWiC plain-text output
        remove_old_plaintext -- unused here; kept for interface compatibility
        """

        # Normalize both directory paths (project helper, e.g. trailing separator)
        user_source_dir = Utils_MalletInterpret.FormatPath(user_source_dir)
        corpus_source_dir = Utils_MalletInterpret.FormatPath(corpus_source_dir)

        # A single source text may expand into several chunk files, so the
        # output files are numbered sequentially across the whole corpus.
        next_file_number = 1
        for source_path in glob.glob(user_source_dir + "*.txt"):
            source_text = TWiC_Text(source_path)
            next_file_number += source_text.ConvertToPlainText_Chunks(corpus_source_dir, next_file_number)
Example #2
0
    def Build_TextObjects(TextClass, mallet_script, tp_collection):
        """Instantiate one TextClass object per topic-proportion record.

        Each record's filename is reduced to its base name and resolved
        against the MALLET script's TEI source directory with a ".tei" suffix.
        Returns the list of constructed text objects, in input order.
        """

        return [
            TextClass('{0}{1}.tei'.format(mallet_script.tei_source,
                                          Utils_MalletInterpret.GetFilename(current_tp.filename)))
            for current_tp in tp_collection
        ]
Example #3
0
File: twic_text.py  Project: jarmoza/twic2
    def ConvertToPlainText_Chunks(self, p_output_dir, p_file_number, p_chunk=True, p_chunk_size=5000):
        """Write this text out as one or more numbered plain-text files.

        p_output_dir  -- destination directory (expects trailing separator)
        p_file_number -- first sequential file number to use
        p_chunk       -- if True, split the prepared lines into chunks
        p_chunk_size  -- maximum lines per chunk when chunking

        Returns the number of files written, so the caller can advance
        its running file counter.
        """

        base_name = self.GetFilename()
        extension = self.GetFileExtension()
        prepared_lines = self.GetPreparedLines()

        # Either split into fixed-size chunks or emit everything as one chunk
        if p_chunk:
            chunks = Utils_MalletInterpret.GetChunkedLines(prepared_lines, p_chunk_size)
        else:
            chunks = [prepared_lines]

        # Each chunk becomes its own file: <number>_<name>_<chunkindex><ext>
        for chunk_index, chunk in enumerate(chunks):
            output_path = "{0}{1}_{2}_{3}{4}".format(p_output_dir, p_file_number, base_name, chunk_index, extension)
            with open(output_path, 'w') as plaintext_output_file:
                # unidecode transliterates to ASCII before writing
                plaintext_output_file.writelines(unidecode(chunk_line) + u"\n" for chunk_line in chunk)
            p_file_number += 1

        return len(chunks)
Example #4
0
    def InterpretMalletOutput(mallet_script):
        """Convert MALLET output into the JSON files driving the TWiC visualization.

        mallet_script -- project object exposing the MALLET corpus files
                         (topics, keys, state, word weights) and corpus metadata.

        Side effects: writes per-text JSON plus corpus-level JSON map files
        under ../../../data/input/.
        """

        # print(...) with a single argument prints identically under Python 2
        # and 3, so these statements use the forward-compatible function form.
        print("Interpreting MALLET output for TWiC visualization...")

        myoutput_dir = os.path.join("..", "..", "..", "data", "input" + os.sep)

        print("\tReading in MALLET output...")

        # 1. corpus.topics.tsv - per-text topic proportions ("2.0.9" is the MALLET format version)
        print("\t\tLoading topics for texts...")
        tp_collection = mallet_script.GetTopicsFileData("2.0.9")

        # 2. corpus.keys.tsv - top words and overall proportion for each topic
        print("\t\tLoading topic keys...")
        topic_keys = mallet_script.GetKeysFileData()

        # 3. corpus.topic-state.tsv - per-word topic assignments
        print("\t\tLoading topic words state file...")
        fwt_collection = mallet_script.GetStateFileData()

        # 4. corpus.wordweights.tsv - word weight per topic
        print("\t\tLoading topic word weights...")
        ww_table = mallet_script.GetTopicWordWeights()

        # 5. Build a text object for each text
        print("\tBuilding text objects...")
        textobj_collection = TWiC_MalletInterpret.Build_TextObjects_Opt(TWiC_Text, mallet_script, tp_collection)

        # 6. Generate a visually distinct color per topic
        print("\tCreating color list...")
        color_list = Utils_Color.Get_UniqueColorList(len(topic_keys.corpus_topic_proportions.keys()))

        # 7. Build JSON for each text (low/mid-level TWiC views). Each text
        # object is matched back to its topic-proportion record by the base
        # filename prefix before the first underscore.
        print("\tCreating JSON files for TWiC views of individual texts...")
        for text in textobj_collection:
            current_tp = None
            for tp in tp_collection:
                if text.GetFilename().split("_")[0] == Utils_MalletInterpret.GetFilenameWithUnderscore(tp.filename):
                    current_tp = tp
                    break
            TWiC_MalletInterpret.Build_JSONForTextwithForeignObject(text, myoutput_dir, "{0}.css".format(mallet_script.corpus_name), \
                                     current_tp, fwt_collection, topic_keys, color_list, mallet_script, True)

        # 8. Build corpus-level JSON files for the visualization
        print("\tBuilding corpus-level JSON map files...")

        # Hierarchy of Corpus -> Text clusters -> Texts based on Jensen-Shannon distance
        TWiC_MalletInterpret.Build_CorpusMapJSON_Avg(mallet_script.corpus_title, topic_keys.corpus_topic_proportions, tp_collection, myoutput_dir + "json" + os.sep)

        # Corpus info consumed client-side (titles, proportions, colors)
        TWiC_MalletInterpret.Build_CorpusInfoJSON(mallet_script.corpus_title, textobj_collection, tp_collection, topic_keys, color_list, myoutput_dir + "json" + os.sep)

        # Distribution weights of words likely to appear in each topic
        TWiC_MalletInterpret.Build_WordWeightJSON(ww_table, myoutput_dir + "json" + os.sep)

        print("Finished processing {0} for TWiC.".format(mallet_script.corpus_title))
Example #5
0
    def Build_CorpusInfoJSON(corpus_title, text_collection, tp_collection, topic_keys, color_list, output_dir):
        """Write "twic_corpusinfo.json" for the client-side visualization.

        corpus_title    -- display title for the corpus
        text_collection -- text objects (must supply GetFilename()/GetTitle())
        tp_collection   -- per-file topic-proportion records (fileid, filename, topic_guide)
        topic_keys      -- corpus-level topic words and proportions
        color_list      -- hex color per topic, indexed by int topic number
        output_dir      -- destination directory (expects trailing separator)

        JSON layout actually produced:
        {
            "topic_info"  : [[words_topic0, ..., words_topicN],   # two parallel lists,
                             [color_topic0, ..., color_topicN]],  # NOT per-topic pairs
            "corpus_info" : ["Corpus Title", [prop_topic0, ..., prop_topicN]],
            "file_info"   : {
                "<fileid>" : [filename, text_title, [proportions...],
                              stanza_count, line_count, word_count]
            }
        }

        NOTE(review): the original header comment described "topic_info" as a
        list of [topic_words, color] pairs, but the code has always emitted
        the two parallel lists shown above; presumably the client expects
        that shape, so it is preserved here -- confirm against the client.
        """

        # Indexers into the JSON structures

        # "topic_info" pair indexers -- unused by the emitted shape (see NOTE)
        topic_info = "topic_info"
        TI_TopicWords = 0
        TI_Color = 1

        # "corpus_info"
        corpus_info = "corpus_info"
        CI_CorpusTitle = 0
        CI_TopicProportions = 1

        # "file_info" (indexed by str numeric file ID)
        file_info = "file_info"
        FI_Filename = 0
        FI_TextTitle = 1
        FI_TopicProportions = 2
        FI_StanzaCount = 3
        FI_LineCount = 4
        FI_WordCount = 5
        FI_FieldCount = 6

        json_output = { topic_info : [ ], corpus_info : ["", []], file_info : { } }
        topic_count = len(topic_keys.corpus_topic_proportions.keys())

        # Fill out corpus_info and topic_info (topic dicts are keyed by str topic id)
        json_output[corpus_info][CI_CorpusTitle] = corpus_title
        json_output[corpus_info][CI_TopicProportions] = [topic_keys.corpus_topic_proportions[str(topic_index)]
                                                         for topic_index in range(topic_count)]
        json_output[topic_info] = [[topic_keys.corpus_topic_words[str(topic_index)] for topic_index in range(topic_count)],
                                   [color_list[topic_index] for topic_index in range(topic_count)]]

        # Fill out file_info: one fixed-width, zero-initialized record per file.
        # (Fix: removed a dead assignment that built a list only to overwrite it.)
        for tp in tp_collection:
            json_output[file_info][tp.fileid] = [0 for index in range(FI_FieldCount)]
            json_output[file_info][tp.fileid][FI_Filename] = Utils_MalletInterpret.GetFilename(tp.filename)
            json_output[file_info][tp.fileid][FI_TopicProportions] = [tp.topic_guide[str(topic_index)]
                                                                      for topic_index in range(topic_count)]

        # Attach text titles by matching base filenames
        for text in text_collection:
            text_filename = text.GetFilename()
            for fileid in json_output[file_info].keys():
                if text_filename == json_output[file_info][fileid][FI_Filename]:
                    json_output[file_info][fileid][FI_TextTitle] = text.GetTitle()

        # TODO: FI_StanzaCount, FI_LineCount, and FI_WordCount are still
        # zero-filled; they were never computed in the original either.

        # Output JSON
        with open(output_dir + "twic_corpusinfo.json", "w") as output_file:
            output_file.write(json.dumps(json_output))
Example #6
0
    def Build_HTMLandJSONForText(text, output_dir, css_filename, current_tp, fwt_collection, topic_keys, color_list, mallet_script, split_filename=False):
        """Write a topic-colored HTML page for one text, plus its mid-level JSON.

        text           -- text object (GetFilename/GetTitle/GetPublication)
        output_dir     -- base output directory containing html/ and json/ subdirs
        css_filename   -- stylesheet referenced from the generated HTML
        current_tp     -- topic-proportion record; its .filename is the plain-text path
        fwt_collection -- per-file word/topic state-file records
        topic_keys     -- topic words keyed by topic id
        color_list     -- hex color per topic, indexed by int topic id
        mallet_script  -- project MALLET wrapper passed through to JSON conversion
        split_filename -- if True, match files on the prefix before the first '_'

        Fix: both the HTML output file and the plain-text input file are now
        opened with context managers, so the handles are released even if an
        exception is raised mid-write (the originals leaked on error).
        """

        file_id = text.GetFilename()
        if split_filename:
            file_id = text.GetFilename().split("_")[0]

        with open(output_dir + "html" + os.sep + file_id + '.html', 'w') as output_html:
            output_html.write('<html>\n')
            output_html.write('\t<head>\n')
            output_html.write('\t\t<link rel="stylesheet" type="text/css" href="{0}">\n'.format(css_filename))
            output_html.write('\t</head>\n')
            output_html.write('\t<body>\n')
            output_html.write('\t\t<div class="left">\n')
            output_html.write('\t\t\t<div class="title">\n')
            output_html.write('\t\t\t\t{0}<br>\n'.format(text.GetTitle()))
            output_html.write('\t\t\t</div>\n')
            output_html.write('\t\t\t{0}<br>\n'.format(text.GetPublication()))
            output_html.write('\t\t</div>\n')
            output_html.write('\t\t<div class="center">\n')

            # Figure out the possible topics for each word based on the topic state file
            current_fwt = None
            for fwt in fwt_collection:
                fwt_file_id = Utils_MalletInterpret.GetFilename(fwt.GetFilename())
                if split_filename:
                    fwt_file_id = Utils_MalletInterpret.GetFilenameWithUnderscore(fwt.GetFilename())
                if fwt_file_id == file_id:
                    current_fwt = fwt
                    break

            # Convert text to JSON readable by the high-level TWiC visualization
            TWiC_MalletInterpret.ConvertTextToJSON(text, output_dir + "json" + os.sep + "texts" + os.sep, mallet_script, current_fwt)

            # Read in the plain text file
            with open(current_tp.filename, 'r') as input_file:
                data = input_file.readlines()

            # If there was no state file entry, output HTML lines without topics
            used_topics_list = []
            if None == current_fwt:
                for line in data:
                    output_line = ''
                    words = line.split(' ')
                    for actual_word_index in range(0, len(words)):
                        output_line += words[actual_word_index] + ' '
                    output_line = output_line.strip()
                    output_html.write('\t\t\t' + output_line + '<br>\n')
            else:
                # Walk the state-file words in parallel with the text's words:
                # when they match (case-insensitively, after clean_word), color
                # the word by its assigned topic and advance the state index.
                statefile_word_index = 0
                for line in data:

                    output_line = ''
                    words = line.split(' ')
                    if statefile_word_index < len(current_fwt.word_info):
                        lowercase_state_word = clean_word(current_fwt.word_info[statefile_word_index].word.lower())

                    # Go through each word in the line
                    for actual_word_index in range(0, len(words)):

                        # Lowercase only for comparison
                        lowercase_word = clean_word(words[actual_word_index].lower())

                        if statefile_word_index < len(current_fwt.word_info) and \
                           lowercase_word == lowercase_state_word:

                            output_line += '<span title="Topic {0}"><font color="{1}"><b>{2}</b></font></span>'.format(current_fwt.word_info[statefile_word_index].topic,
                                color_list[int(current_fwt.word_info[statefile_word_index].topic)], words[actual_word_index])

                            # Track which topics actually appear, for the legend below
                            if current_fwt.word_info[statefile_word_index].topic not in used_topics_list:
                                used_topics_list.append(current_fwt.word_info[statefile_word_index].topic)

                            statefile_word_index += 1
                            if statefile_word_index < len(current_fwt.word_info):
                                lowercase_state_word = clean_word(current_fwt.word_info[statefile_word_index].word.lower())
                        else:
                            output_line += words[actual_word_index]

                        output_line += ' '

                    output_line = output_line.strip()
                    output_html.write('\t\t\t' + output_line + '<br>\n')

            # Legend of the topics used in this text
            output_html.write('\t\t</div><br><br>\n')
            output_html.write('\t\t<div class="topics">\n')
            for used_topic in used_topics_list:
                output_html.write('\t\t\t<font color="{0}">Topic {1}: {2}</font><br>\n'.format(color_list[int(used_topic)], used_topic, topic_keys.corpus_topic_words[used_topic]))
            output_html.write('\t\t</div>\n')
            output_html.write('\t</body>\n')
            output_html.write('</html>')
Example #7
0
    def Build_JSONForTextwithForeignObject(text, output_dir, css_filename, current_tp, fwt_collection, topic_keys, color_list, mallet_script, split_filename=False):
        """Write a per-text JSON whose "full_text" field holds XHTML spans
        (topic-colored words) for insertion into an SVG foreignObject
        client-side.
        """

        file_id = text.GetFilename()
        if split_filename:
            file_id = text.GetFilename().split("_")[0]

        # Locate this file's word/topic assignments in the state-file data
        current_fwt = None
        for fwt in fwt_collection:
            fwt_file_id = Utils_MalletInterpret.GetFilename(fwt.GetFilename())
            if split_filename:
                fwt_file_id = Utils_MalletInterpret.GetFilenameWithUnderscore(fwt.GetFilename())
            if fwt_file_id == file_id:
                current_fwt = fwt
                break

        # Get the line-word-topic map back as data (final False = do not write yet)
        json_data = TWiC_MalletInterpret.ConvertTextToJSON(text, output_dir + "json" + os.sep + "texts" + os.sep, mallet_script, current_fwt, False)

        # Sentinel string returned when the state file had no entry for this text
        if "No state file data" == json_data:
            print("Warning: No state file data for {0}".format(text.GetFilename()))
            return

        # XHTML fragments, joined below into the foreignObject body
        html_parts = []

        # Initial spacing paragraph between the panel's control bar and the body
        html_parts.append("<xhtml:p class=\"text_p\"><xhtml:span class=\"text_edgespan\">&nbsp;</xhtml:span></xhtml:p>")

        # One paragraph per text line; words with a topic get a colored span
        for entry in json_data["document"]["lines_and_colors"]:
            html_parts.append("<xhtml:p class=\"text_p\">")
            html_parts.append("<xhtml:span class=\"text_edgespan\">&nbsp;&nbsp;&nbsp;&nbsp;</xhtml:span>")
            for word_index, word in enumerate(entry[0]):
                word_key = str(word_index)
                if word_key in entry[1]:
                    html_parts.append("<xhtml:span class=\"text_coloredword\" style=\"color:{0}\">{1}&nbsp;</xhtml:span>".format(
                        color_list[int(entry[1][word_key])], word))
                else:
                    html_parts.append("<xhtml:span class=\"text_word\">{0}&nbsp;</xhtml:span>".format(word))
            html_parts.append("</xhtml:p>")

        # The joined fragments become the foreignObject HTML for this text
        json_data["document"]["full_text"] = ''.join(html_parts)

        # The line count drives the panel height client-side
        json_data["document"]["line_count"] = len(json_data["document"]["lines_and_colors"])

        # Write the JSON file for this text
        with open(output_dir + "json" + os.sep + "texts" + os.sep + text.GetFilename() + ".json", 'w') as fileptr:
            fileptr.write(json.dumps(json_data))
Example #8
0
def main(args):
    """Entry point: run the Corpus2Vis pipeline on the given args, timed."""

    Utils_MalletInterpret.TimeAndRun(Corpus2Vis, args)
Example #9
0
    def InterpretMalletOutput(mallet_script):
        """Convert MALLET output for the Dickinson corpus into TWiC JSON files.

        mallet_script -- project object exposing the MALLET corpus files
                         (topics, keys, state, word weights) and metadata.

        Side effects: writes per-text JSON plus corpus-level JSON map files
        under ../../../data/dickinson/input/.
        """

        # print(...) with a single argument prints identically under Python 2
        # and 3, so these statements use the forward-compatible function form.
        print('Interpreting MALLET output for visualization...')

        myoutput_dir = os.path.join("..", "..", "..", "data", "dickinson",
                                    "input" + os.sep)

        print('\tReading in MALLET output...')

        # 1. dickinson.topics.tsv - per-text topic proportions ("2.0.9" is the MALLET format version)
        tp_collection = mallet_script.GetTopicsFileData("2.0.9")

        # 2. dickinson.keys.tsv - top words and overall proportion for each topic
        topic_keys = mallet_script.GetKeysFileData()

        # 3. dickinson.topic-state.tsv - per-word topic assignments
        fwt_collection = mallet_script.GetStateFileData()

        # 4. dickinson.wordweights.tsv - word weight per topic
        ww_table = mallet_script.GetTopicWordWeights()

        # 5. Build a text object for each text
        print('\tBuilding text objects...')
        textobj_collection = TWiC_MalletInterpret.Build_TextObjects_Opt(
            TWiC_Poem, mallet_script, tp_collection)

        # 6. Generate a visually distinct color per topic
        print('\tCreating color list...')
        color_list = Utils_Color.Get_UniqueColorList(
            len(topic_keys.corpus_topic_proportions.keys()))

        # 7. Build JSON for each text (low/mid-level TWiC views). Here texts
        # are matched to their topic-proportion records by exact base filename
        # (no underscore-prefix splitting, unlike the generic pipeline).
        print('\tCreating JSON files for TWiC views of texts...')
        for text in textobj_collection:
            current_tp = None
            for tp in tp_collection:
                if text.GetFilename() == Utils_MalletInterpret.GetFilename(
                        tp.filename):
                    current_tp = tp
                    break
            TWiC_MalletInterpret.Build_JSONForTextwithForeignObject(text, myoutput_dir, '{0}.css'.format(mallet_script.corpus_name), \
                                     current_tp, fwt_collection, topic_keys, color_list, mallet_script)

        # 8. Build corpus-level JSON files for the visualization
        print('\tBuilding JSON map files for TWiC visualization...')

        # Hierarchy of Corpus -> Text clusters -> Texts based on Jensen-Shannon distance
        TWiC_MalletInterpret.Build_CorpusMapJSON_Avg(
            mallet_script.corpus_title, topic_keys.corpus_topic_proportions,
            tp_collection, myoutput_dir + "json" + os.sep)

        # Corpus info consumed client-side (titles, proportions, colors)
        TWiC_MalletInterpret.Build_CorpusInfoJSON(
            mallet_script.corpus_title, textobj_collection, tp_collection,
            topic_keys, color_list, myoutput_dir + "json" + os.sep)

        # Distribution weights of words likely to appear in each topic
        TWiC_MalletInterpret.Build_WordWeightJSON(
            ww_table, myoutput_dir + "json" + os.sep)