def printHTML(self, page, codeLinkTo='all'):
    """
    Append this post to page as one table row.

    The row holds three cells: the poster (linked to the poster's page), the
    post text (linked to this post's anchor on its thread page) and the
    post's codes, one link per code, separated by two line breaks.

    codeLinkTo picks where each code link points:
        'all'            -> the code page covering every thread
        'this_interview' -> the code page restricted to this post's thread
    Any other value raises NameError as soon as a code is rendered.
    """
    page.tr(id=str(self.postID))

    # Cell 1: the poster
    page.td()
    poster_target = "{}.html".format(urlSafe(self.poster))
    page.a(self.poster, href=poster_target)
    page.td.close()

    # Cell 2: the text, linked to the post's anchor on the thread page
    page.td()
    text_target = ''.join(
        (self.thread.outFileBase, '.html#', str(self.postID)))
    page.a(self.text, href=text_target)
    page.td.close()

    # Cell 3: the codes
    page.td()
    for position, code in enumerate(self.codes):
        if position > 0:
            page.br()
            page.br()
        # Resolve the link target first, then emit the anchor once
        if codeLinkTo == 'all':
            code_target = urlSafe(code) + '.html'
        elif codeLinkTo == 'this_interview':
            code_target = urlSafe(code) + '_' + self.thread.title + '.html'
        else:
            raise NameError(
                'invalid parameter: codeLinkTo needs to be all or this_interview'
            )
        page.a(code, href=code_target)
    page.td.close()

    page.tr.close()
def genCodePostsHTMLReddit(threads, outputdir, code, project_title):
    """
    Write the posts tab of a code page to <outputdir>/html/<code>.html.

    The page carries the shared header menu, a submenu linking the code's
    quotes and interviews tabs, and a table with one row for every post in
    threads whose codes include code.
    """
    out_path = "{}/html/{}.html".format(outputdir, urlSafe(code))
    with open(out_path, mode="w") as outFile:
        title = "All posts in {} tagged with {}".format(project_title, code)
        page = genHeaderMenu(markup.page(), title)

        # Submenu: quotes - interviews
        quotes_href = "{}.html".format(urlSafe(code))
        interviews_href = "{}_interviews.html".format(urlSafe(code))
        page.div(class_="submenu")
        page.a("quotes", color="blue", href=quotes_href)
        page.add(" - ")
        page.a("interviews", color="blue", href=interviews_href)
        page.div.close()

        # Table header
        page.table(style="width: 100%; table-layout: fixed; max-width: 90vw")
        page.tr(class_="table-header")
        for label, width in (('speaker', "15%"), ('text', "50%"),
                             ('codes', "20%")):
            page.th(label, width=width)
        page.tr.close()

        # One row per post tagged with this code, in thread order
        tagged_posts = (post for thread in threads for post in thread.posts
                        if code in post.codes)
        for post in tagged_posts:
            post.printHTML(page)

        page.table.close()
        outFile.write(str(page))
def __init__(self, title, outFileDir):
    """
    Set up an empty Thread.

    title is passed through urlSafe and the result is stored both as the
    thread's title and as outFileBase; outFileDir is stored as given. The
    thread starts with no posts and an empty code histogram whose missing
    codes count as 0.
    """
    safe_title = urlSafe(title)
    self.title = safe_title
    self.outFileBase = safe_title
    self.outFileDir = outFileDir
    self.posts = list()
    self.codeHistogram = defaultdict(int)
def genHistograms(threads, outputdir, codeCounts, project_title):
    """
    Generates the histograms page: a table with one row per code showing how
    many distinct threads, posts and posters used it, plus links to those
    posters. Rows are ordered by number of distinct threads, descending.

    Inputs:
        threads <list>: list of thread objects (not read by this function;
            kept so the signature matches the other generators)
        outputdir <str>: directory for output specified in arguments; the
            page is written to <outputdir>/html/histograms.html
        codeCounts <dict>: counts per code, processed in readOriginalCSVs
            {
                code1: {
                    'threads': <set> of the distinct titles of threads with this code,
                    'posts': <int> count of number of posts for this code,
                    'posters': <set> of the distinct posters who said something with this code,
                }
                ...
            }
        project_title <str>: used to generate the page title in the format
            "<project_title>: Histograms"
    Outputs:
        Writes the page to file, does not return
    """
    freqSortedCodeCounts = sorted(codeCounts.items(),
                                  key=lambda tup: len(tup[1]['threads']),
                                  reverse=True)

    with open(outputdir + '/html/' + 'histograms.html', mode='w+') as outFile:
        header = "{}: Histograms".format(project_title)
        page = markup.page()
        page = genHeaderMenu(page, header)

        page.table(style="width: 100%", id_="histograms-table")
        page.tr(class_="table-header")
        page.th('code')
        page.th('# distinct interviews')
        page.th('# distinct quotes')
        page.th('# distinct speakers')
        page.th('speakers')
        page.tr.close()

        for code, counts in freqSortedCodeCounts:
            posters = counts['posters']
            # Named code_threads so the `threads` argument is not clobbered
            code_threads = counts['threads']
            post_count = counts['posts']

            page.tr()
            page.td()
            page.a(code, href="{}.html".format(urlSafe(code)))
            page.td.close()
            page.td(str(len(code_threads)))
            page.td(str(post_count))
            page.td(str(len(posters)))
            page.td(class_="histogram-posters")
            # posters is a set; sort it so the page is identical run to run
            for poster in sorted(posters):
                page.a(poster, href="{}.html".format(urlSafe(poster)))
            page.td.close()
            page.tr.close()

        page.table.close()
        # The with-statement closes outFile; no explicit close() is needed
        outFile.write(str(page))
def reformat(in_folder_name, out_folder_name, codes, codeCorrections):
    """
    Recursively reformat every transcript CSV found under in_folder_name.

    For each CSV one output file is created in out_folder_name, named
    urlSafe("<name without its last four characters>.csv"), and each data
    row is handed to add_line, which appends to that file.

    Inputs:
        in_folder_name <str>: input folder; entry names are appended to it
            directly, so it is expected to end with a path separator
        out_folder_name <str>: output folder prefix, used the same way
        codes <list>: known codes, forwarded to add_line
        codeCorrections <dict>: forwarded to add_line and rebound to the
            value add_line returns after every data row
    Outputs:
        Writes the reformatted files, does not return
    """
    for filename in os.listdir(in_folder_name):
        if filename == '.DS_Store':
            continue

        infile_name = in_folder_name + filename

        if os.path.isdir(infile_name):
            # Only real directories are recursed into; previously any entry
            # without '.csv' in its name was assumed to be a directory and
            # a stray file crashed os.listdir
            reformat(infile_name + '/', out_folder_name, codes,
                     codeCorrections)
            continue

        if '.csv' not in filename:
            # Neither a directory nor a CSV: nothing to reformat
            continue

        participantID = filename[:-4]
        with open(infile_name) as infile:
            outfile_name = out_folder_name + \
                urlSafe("{}.csv".format(participantID))

            # Create/truncate the output file; add_line appends to it
            with open(outfile_name, mode="w+"):
                print("\ncreating " + outfile_name)

            num_codes = 0
            header_seen = False
            for line in infile:
                if line.replace(',', '').strip() == '':
                    # Row holds nothing but commas/whitespace
                    continue
                if not header_seen:
                    # The first non-blank row is the header: the number of
                    # commas in it gives the variable number of tags.
                    # Tracking this with a flag (rather than line index 0)
                    # keeps it working when the file starts with blank rows.
                    header_seen = True
                    num_codes = len(line.split(',')[1:]) - 1
                    print(outfile_name + ' num_codes: {}'.format(num_codes))
                else:
                    codeCorrections = add_line(line, outfile_name, num_codes,
                                               codes, codeCorrections)
def genPosterPostsHTML(poster, outputdir):
    """
    For a given poster, generate their posts (quotes) page.

    The page is written to <outputdir>/html/<username>_quotes.html, where
    username is urlSafe(poster.name). It contains the shared header menu, a
    submenu linking the poster's codes / interviews / quotes pages, and one
    table row per post in poster.posts, with code links pointing at the
    per-interview code pages.
    """
    username = urlSafe(poster.name)
    with open("{}/html/{}_quotes.html".format(outputdir, username),
              mode="w+") as outfile:
        header = "All coded activity for poster {}".format(username)
        page = markup.page()
        page = genHeaderMenu(page, header)

        page.div(class_="submenu")
        page.a("codes", color="blue", href="{}.html".format(username))
        page.add(" - ")
        page.a("interviews",
               color="blue",
               href="{}_interviews.html".format(username))
        page.add(" - ")
        page.a("quotes", color="blue", href="{}_quotes.html".format(username))
        page.div.close()

        page.table(style="width: 100%; table-layout: fixed")

        # Heading row. n is the number of quotes listed below, i.e. the
        # poster's posts (it previously reported the number of threads).
        page.tr(class_="table-header")
        page.add("<h1>quotes (n={})</h1>".format(len(poster.posts)))
        page.tr.close()

        for post in poster.posts:
            post.printHTML(page, codeLinkTo="this_interview")

        page.table.close()
        # The with-statement closes outfile; no explicit close() is needed
        outfile.write(str(page))
def genCodeThreadsHTML(threads, outputdir, code, project_title):
    """
    Write the threads (interviews) tab of a code page to
    <outputdir>/html/<code>_interviews.html.

    Only threads whose codeHistogram contains code are listed, ordered by
    how many posts carry the code (highest first). Each row links to the
    per-interview page for the code and shows that count.
    """
    out_path = "{}/html/{}_interviews.html".format(outputdir, urlSafe(code))
    with open(out_path, mode="w") as outFile:
        title = "All threads in {} tagged with {}".format(project_title, code)
        page = genHeaderMenu(markup.page(), title)

        # Threads using this code, most quotes first
        tagged_threads = [t for t in threads if code in t.codeHistogram]
        tagged_threads.sort(key=lambda t: t.codeHistogram[code], reverse=True)

        # Submenu: quotes - interviews (n=...)
        page.div(class_="submenu")
        page.a("quotes", color="blue", href="{}.html".format(urlSafe(code)))
        page.add(" - ")
        interviews_label = "interviews (n={})".format(len(tagged_threads))
        page.a(interviews_label,
               color="blue",
               href="{}_interviews.html".format(urlSafe(code)))
        page.div.close()

        page.table(style="width: 100%; table-layout: fixed; max-width: 90vw")
        page.tr(class_="table-header")
        page.th('interview', width="50%")
        page.th('# quotes with this code', width="15%")
        page.tr.close()

        for entry in tagged_threads:
            page.tr()

            # Thread title, linked to the code's page for that thread
            page.td()
            entry_href = "{}_{}.html".format(urlSafe(code), entry.title)
            page.a(entry.title, href=entry_href)
            page.td.close()

            # Number of posts with this code
            page.td(entry.codeHistogram[code])

            page.tr.close()

        page.table.close()
        outFile.write(str(page))
def genCodeCSV(threads, outputdir, code):
    """
    Searches through all threads and writes every post tagged with code to
    <outputdir>/csv/<urlSafe(code)>.csv.

    The first row is the header thread, postID, speaker, text, code. Each
    following row holds the thread title, post ID, poster and text of one
    matching post, followed by ALL of that post's codes (one per column), so
    rows may be wider than the header.
    """
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; without it rows end in \r\r\n on Windows
    with open(outputdir + '/csv/' + urlSafe(code) + '.csv', 'w',
              newline='') as outFile:
        fields = ['thread', 'postID', 'speaker', 'text', 'code']
        writer = csv.writer(outFile, dialect='excel')
        writer.writerow(fields)

        for thread in threads:
            for post in thread.posts:
                if code in post.codes:
                    row = [thread.title, post.postID, post.poster, post.text]
                    row.extend(post.codes)
                    writer.writerow(row)
def genPosterCodesHTML(poster, outputdir):
    """
    Write a poster's codes page to <outputdir>/html/<username>.html, where
    username is urlSafe(poster.name).

    The page shows the shared header menu, a submenu linking the poster's
    codes / interviews / quotes pages, and a table of every code in
    poster.codes with its count, highest count first.
    """
    username = urlSafe(poster.name)
    out_path = "{}/html/{}.html".format(outputdir, username)
    with open(out_path, mode="w+") as outfile:
        title = "All coded activity for poster {}".format(username)
        page = genHeaderMenu(markup.page(), title)

        # Submenu: codes - interviews - quotes
        page.div(class_="submenu")
        page.a("codes", color="blue", href="{}.html".format(username))
        page.add(" - ")
        page.a("interviews",
               color="blue",
               href="{}_interviews.html".format(username))
        page.add(" - ")
        page.a("quotes", color="blue", href="{}_quotes.html".format(username))
        page.div.close()

        page.table(style="width: 100%; table-layout: fixed")

        # Heading row with the number of distinct codes
        page.tr(class_="table-header")
        page.add("<h1>codes (n={})</h1>".format(len(poster.codes)))
        page.tr.close()

        # Column titles
        page.tr(class_="table-header")
        for column_title in ("code", "count"):
            page.th(column_title)
        page.tr.close()

        # One row per code, most frequent first
        by_count = sorted(poster.codes.items(),
                          key=lambda pair: pair[1],
                          reverse=True)
        for code, count in by_count:
            page.tr(class_="poster-code")
            page.td()
            page.a(code, href="{}.html".format(code))
            page.td.close()
            page.td(count)
            page.tr.close()

        page.table.close()
        outfile.write(str(page))
def genCodePerTransHTML(threads, outputdir, code):
    """
    Write one page per thread listing that thread's posts tagged with code.

    Each page goes to <outputdir>/html/<urlSafe(code)>_<thread title>.html.
    A page is written for every thread, including threads with no matching
    post (their table is empty).
    """
    for thread in threads:
        out_path = ''.join(
            (outputdir, '/html/', urlSafe(code), '_', thread.title, '.html'))
        with open(out_path, 'w') as outFile:
            title = "All references to {} in interview {}".format(
                code, thread.title)
            page = genHeaderMenu(markup.page(), title)

            page.table(style="width: 100%")
            matching_posts = (post for post in thread.posts
                              if code in post.codes)
            for post in matching_posts:
                post.printHTML(page)
            page.table.close()

            outFile.write(str(page))
def add_line(line, outfile_name, num_codes, allCodes, codeCorrections):
    """
    Reformat one raw transcript row and append it to outfile_name.

    The row is split on commas. The first field is the speaker, the last
    num_codes fields are codes, and everything in between is re-joined with
    commas to form the utterance (so commas inside the utterance survive).
    Rows with an empty speaker or an empty sanitized utterance are not
    written. Written rows have the form
        <speaker> =DELIM= <utterance> =DELIM= <code>, <code>, ...

    Inputs:
        line <str>: one raw CSV row
        outfile_name <str>: file to append to
        num_codes <int>: number of trailing code columns; values <= 0 mean
            the row has no code columns
        allCodes: known codes; a code not found here is passed to
            mergeCodes
        codeCorrections: passed to mergeCodes and replaced by what it
            returns
    Outputs:
        Returns the (possibly updated) codeCorrections
    """
    with open(outfile_name, mode="a+") as outfile:
        comma_split = line.strip().split(',')

        # Hacks for codes that contained commas for the Remote Clinic study
        # Retaining to highlight an example of what to do if you code too liberally
        # NOTE(review): index() raises ValueError if the phrase is present
        # in the line but not as exactly this field - confirm acceptable.
        if 'Consultant unfamiliarity with specific platforms' in line:
            i = comma_split.index(
                '"Consultant unfamiliarity with specific platforms (e.g. Android vs. iOS'
            )
            joined_code = " / ".join(comma_split[i:i + 2])
            comma_split[i] = joined_code
            comma_split = comma_split[:i + 1] + comma_split[i + 2:]

        # Index of the first code column. Computed explicitly because the
        # negative slices [-num_codes:] / [1:-num_codes] pick the wrong
        # fields when num_codes <= 0 (for 0 they treat the whole row as
        # codes and leave the utterance empty).
        first_code = max(len(comma_split) - max(num_codes, 0), 0)

        # General code merging
        codes = comma_split[first_code:]
        merged_codes = list()
        for code in codes:
            if code.strip() == "":
                merged_code = ""
            else:
                strippedCode = urlSafe(stripQuotesSpace(code))
                if strippedCode not in allCodes:
                    merged_code, codeCorrections = mergeCodes(strippedCode,
                                                              allCodes,
                                                              codeCorrections,
                                                              skip=False)
                else:
                    merged_code = strippedCode
            merged_codes.append(merged_code)

        speaker = comma_split[0]
        utt = sanitize(",".join(comma_split[1:first_code]))

        if speaker != '' and utt != '':
            outfile_line = '{} =DELIM= {} =DELIM= '.format(speaker, utt)
            outfile_line += ''.join(
                '{}, '.format(merged_code) for merged_code in merged_codes)
            outfile.write(outfile_line + '\n')

    # The with-statement closes outfile; no explicit close() is needed
    return codeCorrections
except OSError as exception: if exception.errno != errno.EEXIST: if (not os.path.isdir(outputdir)): print("Error: outputdir specified as", outputdir, "exists but is not a directory") raise # Then the inputdir try: os.makedirs(inputdir) except OSError as exception: if exception.errno != errno.EEXIST: if (not os.path.isdir(outputdir)): print("Error: inputdir specified as", outputdir, "exists but is not a directory") raise # Then extract codes from the codebook codes = [] codeCorrections = {} with open(args['c'], 'r') as codeFile: # Read in the codes codeReader = csv.reader(codeFile, dialect='excel') for row in codeReader: # first value is code, second is description. We ignore description for now code = urlSafe(stripQuotesSpace(row[0])) if (code != ''): codes.append(code) reformat(inputdir, outputdir, codes, codeCorrections)
def main():
    """
    Command-line entry point: read a codebook and coded transcripts, then
    write all HTML and CSV outputs under the given output directory.

    Positional arguments are project, outputdir, codebook and one or more
    transcripts (CSV files, or a single directory whose *.csv files are
    used). With -u/--update the threads come from readGeneratedCSVs instead
    of readOriginalCSVs.

    Raises:
        OSError: if the output subfolders cannot be created
    """
    parser = argparse.ArgumentParser(
        description='Process coded transcripts given a codebook.')
    parser.add_argument('-u',
                        '--update',
                        type=str,
                        help="update the indicated master.csv")
    parser.add_argument('project', metavar="project", help="name of project")
    parser.add_argument(
        'outputdir',
        metavar='outputdir',
        help=
        "directory where outputs will be sent. If it doesn't exist it will be created"
    )
    parser.add_argument('codebook',
                        metavar='codebook',
                        help='the codebook CSV file')
    parser.add_argument(
        'transcripts',
        metavar='transcripts',
        help='one or more transcript CSV files, or a directory',
        nargs='+')
    args = vars(parser.parse_args())

    outputdir = args['outputdir']
    # endswith() is safe on an empty string, unlike outputdir[-1]
    if outputdir.endswith('/'):
        outputdir = outputdir[:-1]

    project_title = args['project']

    # Check outputdir, make subfolders. Each subfolder is created on its
    # own with exist_ok=True so that an existing html/ folder no longer
    # prevents csv/ from being created.
    for subfolder in ('/html/', '/csv/'):
        try:
            os.makedirs(outputdir + subfolder, exist_ok=True)
        except OSError:
            if not os.path.isdir(outputdir):
                print("Error: outputdir specified as", outputdir,
                      "exists but is not a directory")
            raise

    codes = []
    codeCounts = {}
    # newline='' is what the csv module asks for when reading files
    with open(args['codebook'], 'r', newline='') as codeFile:
        # Read in the codes
        codeReader = csv.reader(codeFile, dialect='excel')
        for row in codeReader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would fail
                continue
            # first value is code, second is description. We ignore description for now
            code = urlSafe(stripQuotesSpace(row[0]))
            if code != '':
                codes.append(code)
                codeCounts[code] = {
                    'posters': set(),
                    'threads': set(),
                    'posts': 0
                }

    # NOTE(review): the update path only yields threads, so posters used to
    # be unbound when genPosterHTML was reached. It now defaults to an
    # empty dict - confirm genPosterHTML accepts that.
    posters = {}

    # Is this an update?
    if args['update']:
        threads = readGeneratedCSVs(args['update'], args['transcripts'], codes,
                                    outputdir, codeCounts)
    else:
        transcripts_path = Path(args['transcripts'][0])
        # Are we analyzing an entire directory?
        if transcripts_path.is_dir():
            # Use the paths glob() returns instead of concatenating the raw
            # argument with the file name, which produced "dirfile.csv" when
            # the directory was given without a trailing slash. Sorted so
            # the processing order does not depend on the filesystem.
            originalCSVs = sorted(
                str(path) for path in transcripts_path.glob('*.csv'))
            print('Processing directory: ', originalCSVs)
        else:
            originalCSVs = args['transcripts']

        # Read the original CSVs
        threads, codeCounts, posters = readOriginalCSVs(
            originalCSVs, codes, outputdir, codeCounts)

    # Generate a histogram HTML page
    genHistograms(threads, outputdir, codeCounts, project_title)

    # Write code_counts.csv
    genCodeCounts(codeCounts, outputdir)

    # Write out a master CSV
    genMasterCSV(outputdir + '/csv/master.csv', threads)

    # Write out individual posters' pages. TODO: make it an instance method?
    genPosterHTML(posters, outputdir)

    # Write out an interview HTML page
    for interview in threads:
        interview.toHTML()

    # Write out individual HTML for each code
    for code in codes:
        genCodeHTML(threads, outputdir, code, project_title)

    # Write out individual CSV's for each code
    for code in codes:
        genCodeCSV(threads, outputdir, code)

    # Write out individual HTML for each code, interview pair
    for code in codes:
        genCodePerTransHTML(threads, outputdir, code)

    # Generate the main index.html
    genIndex(threads, outputdir, codeCounts, project_title)

    # Generate the stylesheet from the main one
    genStylesheet(outputdir)

    # Print a direct link to the index file for viewing
    print('\nDone! View output at: {}'.format(
        os.path.abspath(outputdir + '/html/index.html')))
def readOriginalCSVs(originalCSVs, allCodes, outputdir, codeCounts):
    """
    Reads in CSVs in their original post-Google spreadsheet form

    Each file becomes one Thread named after the file's base name (without
    extension). Every line is split on '=DELIM=' into poster, text and tags;
    a line that does not split into exactly three parts raises ValueError.

    Inputs:
        originalCSVs: iterable of file paths to read
        allCodes: known codes; a code not found here is passed to mergeCodes
        outputdir: passed to the Thread constructor
        codeCounts <dict>: per-code counts, updated in place; every accepted
            code must already be a key
    Outputs:
        Returns (threads, codeCounts, allPosters), where threads is the list
        of Thread objects in input order and allPosters maps each poster
        string to its Poster object
    """
    numThreads = 0  # incremented per file; not read again in this function
    numPosts = 0  # running count across ALL files; doubles as the post ID
    threads = []
    allCodeCorrections = {}
    allPosters = {}
    for originalCSV in originalCSVs:
        with open(originalCSV, 'r') as transFile:
            threadFileName = os.path.splitext(os.path.basename(originalCSV))[0]
            # Read in thread, populate new Thread object
            numThreads += 1
            thread = Thread(threadFileName, outputdir)
            for line in transFile:
                # Our reformatted threads have the format:
                # speaker =DELIM= utterance =DELIM= tag, tag, tag, ...
                (poster, text, tags) = line.split('=DELIM=')
                # NOTE(review): poster and text are not stripped here, so
                # they presumably keep the spaces around the delimiter and
                # this test only rejects a completely empty field - confirm.
                if (text != ''):
                    numPosts += 1
                    # Process post codes
                    post_codes = tags.split(', ')
                    strippedCodes = []
                    for code in post_codes:
                        strippedCode = urlSafe(stripQuotesSpace(code))
                        if (strippedCode == ''):
                            continue
                        if (strippedCode not in allCodes):
                            correctedCode, allCodeCorrections = mergeCodes(
                                strippedCode, allCodes, allCodeCorrections, skip=True
                            )  #set skip to false to correct codes to nearest code by edit distance
                            # An empty result means no merge target; drop the code
                            if (correctedCode == ''):
                                print("Skipping unrecognized code in '" + strippedCode + "' in file " + thread.title + " that could not be merged")
                                continue
                            strippedCode = correctedCode
                        strippedCodes.append(strippedCode)
                        # Add the poster to the set of people who have said something with this code
                        codeCounts[strippedCode]['posters'].add(poster)
                        # Add the thread to the set of threads that have used this code
                        codeCounts[strippedCode]['threads'].add(thread.title)
                        # Increment the counter of posts with this code
                        codeCounts[strippedCode]['posts'] += 1
                        # Add this code to the thread's code histogram
                        thread.codeHistogram[strippedCode] += 1
                    post = Post(thread, numPosts, poster, text, strippedCodes)
                    thread.addPost(post)
                    # Process poster
                    if poster not in allPosters:
                        allPosters[poster] = Poster(poster)
                    allPosters[poster].addToPosts(post)
                    allPosters[poster].addToThreads(thread.title)
                    allPosters[poster].addToCodeCounts(strippedCodes)
            threads.append(thread)
            # Redundant: the with-statement also closes the file on exit
            transFile.close()
    return threads, codeCounts, allPosters
def genIndex(threads, outputdir, codeCounts, project_title):
    """
    Write <outputdir>/html/index.html, the page linking to all main pages.

    Inputs:
        threads <list>: list of thread objects
        outputdir <str>: directory for output specified in arguments
        codeCounts <dict>: counts per code, processed in readOriginalCSVs
            {
                code1: {
                    'threads': <set> of the distinct titles of threads with this code,
                    'posts': <int> count of number of posts for this code,
                    'posters': <set> of the distinct posters who said something with this code,
                }
                ...
            }
        project_title <str>: used to generate the page title in the format
            "<project_title>: Coded Transcripts"
    Outputs:
        Writes index to file, does not return
    """
    # Codes ordered by the number of distinct threads using them, descending
    ranked_codes = sorted(codeCounts.items(),
                          key=lambda item: len(item[1]['threads']),
                          reverse=True)

    with open(outputdir + '/html/index.html', 'w') as outFile:
        title = "{}: Coded Transcripts".format(project_title)
        page = genHeaderMenu(markup.page(), title)

        page.table(style="width: 100%")

        # Codes header
        page.tr()
        page.td(class_="index-header")
        page.add('<h1>codes (n={})</h1>'.format(len(ranked_codes)))
        page.td.close()
        page.tr.close()

        # Sorted list of codes with their frequencies
        page.tr()
        page.td(class_="index-codes")
        for code, counts in ranked_codes:
            quote_total = counts['posts']
            interview_total = len(counts['threads'])
            page.div(class_="index-code")
            page.a(code, href=urlSafe(code) + '.html')
            page.add(' (quotes={}, interviews={})'.format(
                quote_total, interview_total))
            page.div.close()
        page.td.close()
        page.tr.close()

        # Total number of posts over all threads
        total_posts = sum(len(thread.posts) for thread in threads)

        # Threads header
        page.tr()
        page.td(class_="index-header")
        page.add('<h1>interviews (n={}, quotes={})</h1>'.format(
            len(threads), total_posts))
        page.td.close()
        page.tr.close()

        # Threads listed alphabetically by title
        page.tr()
        page.td(class_="index-threads")
        for thread in sorted(threads, key=lambda item: item.title):
            page.a(thread.title, href=thread.outFileBase + '.html')
            page.br()
        page.td.close()
        page.tr.close()

        page.table.close()
        outFile.write(str(page))