def printHTML(self, page, codeLinkTo='all'):
    """Append one table row describing this post to ``page``.

    The row holds three cells: the poster (linked to ``<poster>.html``),
    the post text (linked to this post's anchor on its thread page), and
    the post's codes, consecutive codes being separated by two line breaks.

    Args:
        page: markup page object the row is written into.
        codeLinkTo: 'all' links every code to ``<code>.html``;
            'this_interview' links it to ``<code>_<thread title>.html``.

    Raises:
        NameError: if ``codeLinkTo`` is neither 'all' nor 'this_interview'.
            The value is only inspected while iterating the codes, so a
            post without codes never raises.
    """
    page.tr(id=str(self.postID))

    # Cell 1: who said it.
    page.td()
    page.a(self.poster, href="{}.html".format(urlSafe(self.poster)))
    page.td.close()

    # Cell 2: what was said, linking back to the post inside its thread.
    post_anchor = self.thread.outFileBase + '.html#' + str(self.postID)
    page.td()
    page.a(self.text, href=post_anchor)
    page.td.close()

    # Cell 3: the codes attached to the post.
    page.td()
    for position, code in enumerate(self.codes):
        if position:
            page.br()
            page.br()

        if codeLinkTo == 'all':
            target = urlSafe(code) + '.html'
        elif codeLinkTo == 'this_interview':
            target = urlSafe(code) + '_' + self.thread.title + '.html'
        else:
            raise NameError(
                'invalid parameter: codeLinkTo needs to be all or this_interview'
            )
        page.a(code, href=target)
    page.td.close()

    page.tr.close()
def genCodePostsHTMLReddit(threads, outputdir, code, project_title):
    """Write the posts tab of a code page to ``<outputdir>/html/<code>.html``.

    The page has a submenu linking to the code's quotes and interviews pages
    followed by a table with one row per post tagged with ``code``.

    Args:
        threads: iterable of thread objects; each exposes ``posts``, and each
            post exposes ``codes`` and ``printHTML(page)``.
        outputdir: output root; the file goes into its ``html`` subfolder.
        code: the code whose posts are listed.
        project_title: project name used in the page header.
    """
    with open("{}/html/{}.html".format(outputdir, urlSafe(code)),
              mode="w") as outFile:
        header = "All posts in {} tagged with {}".format(project_title, code)
        page = markup.page()
        page = genHeaderMenu(page, header)

        page.div(class_="submenu")
        page.a("quotes", color="blue", href="{}.html".format(urlSafe(code)))
        # Use non-breaking-space entities: plain runs of spaces collapse to a
        # single space in HTML, which made this separator render differently
        # from the same submenu on the interviews tab.
        page.add("&nbsp;&nbsp;-&nbsp;&nbsp;")
        page.a("interviews",
               color="blue",
               href="{}_interviews.html".format(urlSafe(code)))
        page.div.close()

        page.table(style="width: 100%; table-layout: fixed; max-width: 90vw")

        page.tr(class_="table-header")
        page.th('speaker', width="15%")
        page.th('text', width="50%")
        page.th('codes', width="20%")
        page.tr.close()

        # One row per post carrying this code, in thread order.
        for thread in threads:
            for post in thread.posts:
                if code in post.codes:
                    post.printHTML(page)

        page.table.close()

        outFile.write(str(page))
    def __init__(self, title, outFileDir):
        """Create an empty thread.

        Args:
            title: raw thread title; its urlSafe form is stored both as
                ``title`` and as ``outFileBase`` (the output file basename).
            outFileDir: path prefix for this thread's output files, stored
                unchanged.
        """
        safe_title = urlSafe(title)

        self.title = safe_title
        self.outFileBase = safe_title
        self.outFileDir = outFileDir

        # Filled in later: the thread's posts and a per-code counter that
        # starts every code at zero.
        self.posts = []
        self.codeHistogram = defaultdict(int)
def genHistograms(threads, outputdir, codeCounts, project_title):
    """Write ``<outputdir>/html/histograms.html``, a per-code summary table.

    Each row shows a code (linked to its page), the number of distinct
    threads, posts and posters using it, and links to those posters. Rows are
    ordered from the most to the fewest distinct threads.

    Args:
        threads: list of thread objects. Not used here; kept so existing
            callers keep working.
        outputdir: output root; the file goes into its ``html`` subfolder.
        codeCounts: dict mapping each code to a dict with
            'threads' (set of distinct thread titles using the code),
            'posts' (int, number of posts with the code) and
            'posters' (set of distinct posters who used the code).
        project_title: used in the page header as "<project_title>: Histograms".

    Returns:
        None. The page is written to disk.
    """
    by_thread_count = sorted(codeCounts.items(),
                             key=lambda item: len(item[1]['threads']),
                             reverse=True)

    with open(outputdir + '/html/' + 'histograms.html', mode='w') as outFile:
        header = "{}: Histograms".format(project_title)
        page = markup.page()
        page = genHeaderMenu(page, header)

        page.table(style="width: 100%", id_="histograms-table")

        page.tr(class_="table-header")
        page.th('code')
        page.th('# distinct interviews')
        page.th('# distinct quotes')
        page.th('# distinct speakers')
        page.th('speakers')
        page.tr.close()

        for code, counts in by_thread_count:
            posters = counts['posters']

            page.tr()
            page.td()
            page.a(code, href="{}.html".format(urlSafe(code)))
            page.td.close()
            page.td(str(len(counts['threads'])))
            page.td(str(counts['posts']))
            page.td(str(len(posters)))
            page.td(class_="histogram-posters")
            # Sets have no stable iteration order; sorting keeps the
            # generated HTML identical from one run to the next.
            for poster in sorted(posters):
                page.a(poster, href="{}.html".format(urlSafe(poster)))
            page.td.close()
            page.tr.close()
        page.table.close()

        outFile.write(str(page))
示例#5
0
def reformat(in_folder_name, out_folder_name, codes, codeCorrections):
    """Reformat every transcript CSV found under ``in_folder_name``.

    Sub-directories are processed recursively. For each file whose name
    contains '.csv', an output file named ``urlSafe(<name minus its last four
    characters> + '.csv')`` is created (or truncated) and every non-blank
    data line is handed to ``add_line``, which appends to that file. The
    first line of each input file is treated as the header and only used to
    count the code columns. '.DS_Store' and any other non-CSV file is skipped.

    Args:
        in_folder_name: directory to scan.
        out_folder_name: prefix prepended verbatim to each output file name,
            so it should end with a path separator.
        codes: known codes, forwarded to ``add_line``.
        codeCorrections: code-correction mapping, forwarded to ``add_line``
            and replaced by whatever ``add_line`` returns.

    Returns:
        None.
    """
    for filename in os.listdir(in_folder_name):
        if filename == '.DS_Store':
            continue

        in_path = os.path.join(in_folder_name, filename)

        if os.path.isdir(in_path):
            # Recurse only into real directories. Previously every entry
            # without '.csv' in its name was assumed to be a directory, which
            # crashed on ordinary non-CSV files.
            reformat(os.path.join(in_path, ''), out_folder_name, codes,
                     codeCorrections)
            continue

        if '.csv' not in filename:
            continue

        participantID = filename[:-4]
        with open(in_path) as infile:
            outfile_name = out_folder_name + \
                urlSafe("{}.csv".format(participantID))
            # Create/truncate the output file; add_line appends to it later.
            with open(outfile_name, mode="w"):
                print("\ncreating " + outfile_name)

            num_codes = 0
            for i, line in enumerate(infile):
                if line.replace(',', '').strip() == '':
                    # Nothing but commas/whitespace: an empty spreadsheet row.
                    continue
                if i == 0:
                    # The header's comma count gives the variable number of
                    # code columns (all columns minus speaker and utterance).
                    num_codes = len(line.split(',')[1:]) - 1
                    print(outfile_name + ' num_codes: {}'.format(num_codes))
                else:
                    codeCorrections = add_line(line, outfile_name, num_codes,
                                               codes, codeCorrections)
def genPosterPostsHTML(poster, outputdir):
    """Write a poster's quotes page to ``<outputdir>/html/<name>_quotes.html``.

    The page has a submenu linking to the poster's codes, interviews and
    quotes pages, a heading with the number of quotes, and one table row per
    post (each code linking to its per-interview page).

    Args:
        poster: poster object exposing ``name`` and ``posts``; each post
            exposes ``printHTML(page, codeLinkTo=...)``.
        outputdir: output root; the file goes into its ``html`` subfolder.
    """
    username = urlSafe(poster.name)
    with open("{}/html/{}_quotes.html".format(outputdir, username),
              mode="w") as outfile:
        header = "All coded activity for poster {}".format(username)
        page = markup.page()
        page = genHeaderMenu(page, header)

        page.div(class_="submenu")
        page.a("codes", color="blue", href="{}.html".format(username))
        page.add("&nbsp;&nbsp;-&nbsp;&nbsp;")
        page.a("interviews",
               color="blue",
               href="{}_interviews.html".format(username))
        page.add("&nbsp;&nbsp;-&nbsp;&nbsp;")
        page.a("quotes", color="blue", href="{}_quotes.html".format(username))
        page.div.close()

        page.table(style="width: 100%; table-layout: fixed")

        # Heading row. n counts the quotes listed below; it used to report
        # the number of threads, which did not match the rows on the page.
        page.tr(class_="table-header")
        page.add("<h1>quotes (n={})</h1>".format(len(poster.posts)))
        page.tr.close()

        for post in poster.posts:
            post.printHTML(page, codeLinkTo="this_interview")

        page.table.close()
        outfile.write(str(page))
def genCodeThreadsHTML(threads, outputdir, code, project_title):
    """Write the interviews tab of a code page.

    Output goes to ``<outputdir>/html/<code>_interviews.html``. The page lists
    every thread whose code histogram contains ``code``, ordered from the
    most to the fewest posts carrying that code, each linking to the
    per-thread page for the code.

    Args:
        threads: iterable of thread objects exposing ``title`` and
            ``codeHistogram``.
        outputdir: output root; the file goes into its ``html`` subfolder.
        code: the code being summarised.
        project_title: project name used in the page header.
    """
    safe_code = urlSafe(code)
    out_path = "{}/html/{}_interviews.html".format(outputdir, safe_code)

    with open(out_path, mode="w") as outFile:
        title_text = "All threads in {} tagged with {}".format(
            project_title, code)
        page = genHeaderMenu(markup.page(), title_text)

        # Keep only threads that use the code, busiest first.
        tagged = [t for t in threads if code in t.codeHistogram]
        tagged.sort(key=lambda t: t.codeHistogram[code], reverse=True)

        # Submenu: quotes tab / interviews tab.
        page.div(class_="submenu")
        page.a("quotes", color="blue", href="{}.html".format(safe_code))
        page.add("&nbsp;&nbsp;-&nbsp;&nbsp;")
        page.a("interviews (n={})".format(len(tagged)),
               color="blue",
               href="{}_interviews.html".format(safe_code))
        page.div.close()

        page.table(style="width: 100%; table-layout: fixed; max-width: 90vw")

        page.tr(class_="table-header")
        page.th('interview', width="50%")
        page.th('# quotes with this code', width="15%")
        page.tr.close()

        for entry in tagged:
            per_thread_page = "{}_{}.html".format(safe_code, entry.title)
            page.tr()
            # Column 1: thread title, linked to the code's page for it.
            page.td()
            page.a(entry.title, href=per_thread_page)
            page.td.close()
            # Column 2: how many posts in the thread carry the code.
            page.td(entry.codeHistogram[code])
            page.tr.close()

        page.table.close()

        outFile.write(str(page))
def genCodeCSV(threads, outputdir, code):
    """Write every post tagged with ``code`` to ``<outputdir>/csv/<code>.csv``.

    The first row is the header ``thread, postID, speaker, text, code``.
    Each following row holds the thread title, post id, poster and text of a
    matching post, followed by all of that post's codes (one per column, so
    rows may be wider than the header).

    Args:
        threads: iterable of thread objects exposing ``title`` and ``posts``.
        outputdir: output root; the file goes into its ``csv`` subfolder.
        code: the code to extract.
    """
    # newline='' lets the csv module control line endings itself; without it
    # rows end in '\r\r\n' on platforms that translate '\n' on write.
    with open(outputdir + '/csv/' + urlSafe(code) + '.csv', 'w',
              newline='') as outFile:
        writer = csv.writer(outFile, dialect='excel')
        writer.writerow(['thread', 'postID', 'speaker', 'text', 'code'])

        for thread in threads:
            for post in thread.posts:
                if code in post.codes:
                    row = [thread.title, post.postID, post.poster, post.text]
                    row.extend(post.codes)
                    writer.writerow(row)
def genPosterCodesHTML(poster, outputdir):
    """Write a poster's codes page to ``<outputdir>/html/<name>.html``.

    The page has a submenu linking to the poster's codes, interviews and
    quotes pages, then a table of the codes the poster used and how often,
    ordered from the most to the least frequent.

    Args:
        poster: poster object exposing ``name`` and ``codes`` (a mapping of
            code to count).
        outputdir: output root; the file goes into its ``html`` subfolder.
    """
    username = urlSafe(poster.name)
    with open("{}/html/{}.html".format(outputdir, username),
              mode="w") as outfile:
        header = "All coded activity for poster {}".format(username)
        page = markup.page()
        page = genHeaderMenu(page, header)

        page.div(class_="submenu")
        page.a("codes", color="blue", href="{}.html".format(username))
        page.add("&nbsp;&nbsp;-&nbsp;&nbsp;")
        page.a("interviews",
               color="blue",
               href="{}_interviews.html".format(username))
        page.add("&nbsp;&nbsp;-&nbsp;&nbsp;")
        page.a("quotes", color="blue", href="{}_quotes.html".format(username))
        page.div.close()

        page.table(style="width: 100%; table-layout: fixed")

        # Heading row with the number of distinct codes.
        page.tr(class_="table-header")
        page.add("<h1>codes (n={})</h1>".format(len(poster.codes)))
        page.tr.close()

        page.tr(class_="table-header")
        page.th("code")
        page.th("count")
        page.tr.close()

        freq_sorted_code_counts = sorted(poster.codes.items(),
                                         key=lambda pair: pair[1],
                                         reverse=True)

        for code, count in freq_sorted_code_counts:
            page.tr(class_="poster-code")
            page.td()
            # NOTE(review): the code is used verbatim in the link, whereas
            # other pages pass it through urlSafe first - confirm the keys of
            # poster.codes are already URL-safe.
            page.a(code, href="{}.html".format(code))
            page.td.close()
            page.td(count)
            page.tr.close()

        page.table.close()
        outfile.write(str(page))
示例#10
0
def genCodePerTransHTML(threads, outputdir, code):
    """Write one page per thread listing that thread's posts tagged ``code``.

    Each page is written to ``<outputdir>/html/<code>_<thread title>.html``.
    A page is produced for every thread, including threads in which no post
    carries the code (the table is then empty).

    Args:
        threads: iterable of thread objects exposing ``title`` and ``posts``.
        outputdir: output root; files go into its ``html`` subfolder.
        code: the code to list.
    """
    for thread in threads:
        out_path = (outputdir + '/html/' + urlSafe(code) + '_' +
                    thread.title + '.html')
        with open(out_path, 'w') as outFile:
            heading = "All references to {} in interview {}".format(
                code, thread.title)
            page = genHeaderMenu(markup.page(), heading)

            page.table(style="width: 100%")
            # Lazily pick out the posts that carry the code, in post order.
            matching = (post for post in thread.posts if code in post.codes)
            for post in matching:
                post.printHTML(page)
            page.table.close()

            outFile.write(str(page))
示例#11
0
def add_line(line, outfile_name, num_codes, allCodes, codeCorrections):
    """Append one reformatted transcript line to ``outfile_name``.

    ``line`` is a raw comma-separated row: speaker, utterance (which may
    itself contain commas), then ``num_codes`` code columns. The output line
    has the form ``speaker =DELIM= utterance =DELIM= code, code, ...``.
    Blank code cells are kept as empty entries; codes missing from
    ``allCodes`` go through ``mergeCodes``. Nothing is written when the
    speaker or the sanitized utterance is empty.

    Args:
        line: raw CSV line.
        outfile_name: path of the file to append to (created if missing).
        num_codes: number of trailing columns holding codes; zero or less
            means the row has no code columns.
        allCodes: collection of known codes.
        codeCorrections: correction mapping passed to and returned by
            ``mergeCodes``.

    Returns:
        The (possibly updated) ``codeCorrections``.
    """
    with open(outfile_name, mode="a") as outfile:
        comma_split = line.strip().split(',')

        # Hack for a code that contained a comma in the Remote Clinic study.
        # Retained as an example of what to do if you code too liberally:
        # glue the two halves of the split code back together.
        split_code_head = (
            '"Consultant unfamiliarity with specific platforms (e.g. Android vs. iOS'
        )
        # Test the exact field rather than a substring of the line, so a
        # partial match can no longer make .index() raise ValueError.
        if split_code_head in comma_split:
            i = comma_split.index(split_code_head)
            comma_split[i:i + 2] = [" / ".join(comma_split[i:i + 2])]

        # Separate the code columns from the utterance columns. A slice with
        # -0 would select the whole row, so zero codes needs its own branch.
        if num_codes > 0:
            codes = comma_split[-num_codes:]
            utterance_fields = comma_split[1:-num_codes]
        else:
            codes = []
            utterance_fields = comma_split[1:]

        # General code merging
        merged_codes = []
        for code in codes:
            if code.strip() == "":
                merged_code = ""
            else:
                strippedCode = urlSafe(stripQuotesSpace(code))
                if strippedCode not in allCodes:
                    merged_code, codeCorrections = mergeCodes(strippedCode,
                                                              allCodes,
                                                              codeCorrections,
                                                              skip=False)
                else:
                    merged_code = strippedCode
            merged_codes.append(merged_code)

        speaker = comma_split[0]
        utt = sanitize(",".join(utterance_fields))
        if speaker != '' and utt != '':
            code_suffix = "".join('{}, '.format(c) for c in merged_codes)
            outfile.write('{} =DELIM= {} =DELIM= {}\n'.format(
                speaker, utt, code_suffix))
    return codeCorrections
示例#12
0
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            if (not os.path.isdir(outputdir)):
                print("Error: outputdir specified as", outputdir,
                      "exists but is not a directory")
                raise

    # Then the inputdir
    try:
        os.makedirs(inputdir)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            if (not os.path.isdir(outputdir)):
                print("Error: inputdir specified as", outputdir,
                      "exists but is not a directory")
                raise

    # Then extract codes from the codebook
    codes = []
    codeCorrections = {}
    with open(args['c'], 'r') as codeFile:
        # Read in the codes
        codeReader = csv.reader(codeFile, dialect='excel')
        for row in codeReader:
            # first value is code, second is description. We ignore description for now
            code = urlSafe(stripQuotesSpace(row[0]))
            if (code != ''):
                codes.append(code)

    reformat(inputdir, outputdir, codes, codeCorrections)
def main():
    """Command-line entry point: build all CSV and HTML outputs for a project.

    Parses the arguments, makes sure ``<outputdir>/html`` and
    ``<outputdir>/csv`` exist, reads the codebook, loads the transcripts
    (either previously generated CSVs when ``--update`` is given, or the
    original CSVs / a directory of them), and then writes the master CSV,
    poster pages, interview pages, per-code HTML and CSV files, the index
    and the stylesheet.

    Raises:
        OSError: if an output sub-directory cannot be created.
    """
    parser = argparse.ArgumentParser(
        description='Process coded transcripts given a codebook.')
    parser.add_argument('-u',
                        '--update',
                        type=str,
                        help="update the indicated master.csv")
    parser.add_argument('project', metavar="project", help="name of project")
    parser.add_argument(
        'outputdir',
        metavar='outputdir',
        help=
        "directory where outputs will be sent. If it doesn't exist it will be created"
    )
    parser.add_argument('codebook',
                        metavar='codebook',
                        help='the codebook CSV file')
    parser.add_argument(
        'transcripts',
        metavar='transcripts',
        help='one or more transcript CSV files, or a directory',
        nargs='+')
    args = vars(parser.parse_args())

    outputdir = args['outputdir']
    # endswith() is safe on an empty string, unlike outputdir[-1].
    if outputdir.endswith('/'):
        outputdir = outputdir[:-1]

    project_title = args['project']

    # Check outputdir, make subfolders. Each folder is created on its own so
    # that an already existing html/ no longer prevents csv/ from being made.
    for subdir in ('/html/', '/csv/'):
        try:
            os.makedirs(outputdir + subdir, exist_ok=True)
        except OSError:
            if os.path.exists(outputdir) and not os.path.isdir(outputdir):
                print("Error: outputdir specified as", outputdir,
                      "exists but is not a directory")
            raise

    # Read in the codes. Only the codebook read needs the file open.
    codes = []
    codeCounts = {}
    with open(args['codebook'], 'r', newline='') as codeFile:
        codeReader = csv.reader(codeFile, dialect='excel')
        for row in codeReader:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            # first value is code, second is description. We ignore
            # description for now
            code = urlSafe(stripQuotesSpace(row[0]))
            if code != '':
                codes.append(code)
                codeCounts[code] = {
                    'posters': set(),
                    'threads': set(),
                    'posts': 0
                }

    # Posters are only available when the original CSVs are read.
    posters = None

    # Is this an update?
    if args['update']:
        threads = readGeneratedCSVs(args['update'], args['transcripts'],
                                    codes, outputdir, codeCounts)
    else:
        transcripts_path = Path(args['transcripts'][0])
        # Are we analyzing an entire directory?
        if transcripts_path.is_dir():
            # Let pathlib build the paths so the directory argument works
            # with or without a trailing slash; sort for a stable order.
            originalCSVs = sorted(
                str(path) for path in transcripts_path.glob('*.csv'))
            print('Processing directory: ', originalCSVs)
        else:
            originalCSVs = args['transcripts']

        # Read the original CSVs
        threads, codeCounts, posters = readOriginalCSVs(
            originalCSVs, codes, outputdir, codeCounts)

        # Generate a histogram HTML page
        genHistograms(threads, outputdir, codeCounts, project_title)

        # Write code_counts.csv
        genCodeCounts(codeCounts, outputdir)

    # Write out a master CSV
    genMasterCSV(outputdir + '/csv/master.csv', threads)

    # Write out individual posters' pages. TODO: make it an instance method?
    # NOTE(review): in --update mode no posters are collected, and this call
    # used to fail with a NameError; poster pages are skipped there instead.
    if posters is not None:
        genPosterHTML(posters, outputdir)

    # Write out an interview HTML page
    for interview in threads:
        interview.toHTML()

    # Write out individual HTML for each code
    for code in codes:
        genCodeHTML(threads, outputdir, code, project_title)

    # Write out individual CSV's for each code
    for code in codes:
        genCodeCSV(threads, outputdir, code)

    # Write out individual HTML for each code, interview pair
    for code in codes:
        genCodePerTransHTML(threads, outputdir, code)

    # Generate the main index.html
    genIndex(threads, outputdir, codeCounts, project_title)

    # Generate the stylesheet from the main one
    genStylesheet(outputdir)

    # Print a direct link to the index file for viewing
    print('\nDone! View output at: {}'.format(
        os.path.abspath(outputdir + '/html/index.html')))
def readOriginalCSVs(originalCSVs, allCodes, outputdir, codeCounts):
    """Read reformatted transcripts and build threads, posts and posters.

    Every non-blank line of each file must have the form
    ``speaker =DELIM= utterance =DELIM= tag, tag, ...``. One Thread is
    created per file (titled after the file's base name without extension)
    and one Post per line with non-empty text. Post ids are numbered
    consecutively across all files. Codes not found in ``allCodes`` are
    passed to ``mergeCodes`` with ``skip=True``; codes that come back empty
    are reported and dropped.

    Args:
        originalCSVs: iterable of paths to reformatted transcript files.
        allCodes: collection of known codes.
        outputdir: output directory handed to each Thread.
        codeCounts: dict mapping code to a dict with 'posters' (set),
            'threads' (set) and 'posts' (int); updated in place.

    Returns:
        A tuple ``(threads, codeCounts, allPosters)`` where ``threads`` is
        the list of Thread objects, ``codeCounts`` is the updated input dict
        and ``allPosters`` maps each poster name to its Poster object.

    Raises:
        ValueError: if a non-blank line does not split into exactly three
            '=DELIM='-separated fields.
    """
    numPosts = 0
    threads = []
    allCodeCorrections = {}
    allPosters = {}

    for originalCSV in originalCSVs:
        with open(originalCSV, 'r') as transFile:
            threadFileName = os.path.splitext(os.path.basename(originalCSV))[0]

            # Read in thread, populate new Thread object
            thread = Thread(threadFileName, outputdir)

            for lineNumber, line in enumerate(transFile, start=1):
                # Blank lines carry no post; skipping them avoids an opaque
                # tuple-unpacking error.
                if line.strip() == '':
                    continue

                fields = line.split('=DELIM=')
                if len(fields) != 3:
                    raise ValueError(
                        "{}, line {}: expected 'speaker =DELIM= utterance "
                        "=DELIM= tags' but found {} field(s)".format(
                            originalCSV, lineNumber, len(fields)))
                poster, text, tags = fields
                if text == '':
                    continue

                numPosts += 1

                # Process post codes
                strippedCodes = []
                for code in tags.split(', '):
                    strippedCode = urlSafe(stripQuotesSpace(code))
                    if strippedCode == '':
                        continue
                    if strippedCode not in allCodes:
                        # set skip to false to correct codes to nearest code
                        # by edit distance
                        correctedCode, allCodeCorrections = mergeCodes(
                            strippedCode,
                            allCodes,
                            allCodeCorrections,
                            skip=True)
                        if correctedCode == '':
                            print("Skipping unrecognized code in '" +
                                  strippedCode + "' in file " + thread.title +
                                  " that could not be merged")
                            continue
                        strippedCode = correctedCode
                    strippedCodes.append(strippedCode)

                    counts = codeCounts[strippedCode]
                    # Who said something with this code
                    counts['posters'].add(poster)
                    # Which threads used this code
                    counts['threads'].add(thread.title)
                    # How many posts carry this code
                    counts['posts'] += 1
                    # Per-thread tally of the code
                    thread.codeHistogram[strippedCode] += 1

                post = Post(thread, numPosts, poster, text, strippedCodes)
                thread.addPost(post)

                # Process poster
                if poster not in allPosters:
                    allPosters[poster] = Poster(poster)

                allPosters[poster].addToPosts(post)
                allPosters[poster].addToThreads(thread.title)
                allPosters[poster].addToCodeCounts(strippedCodes)

            threads.append(thread)

    return threads, codeCounts, allPosters
示例#15
0
def genIndex(threads, outputdir, codeCounts, project_title):
    """Write ``<outputdir>/html/index.html``, linking to all the main pages.

    The page lists every code (ordered from the most to the fewest distinct
    threads, with its quote and interview counts) followed by every thread
    sorted by title.

    Args:
        threads: list of thread objects exposing ``title``, ``outFileBase``
            and ``posts``.
        outputdir: output root; the file goes into its ``html`` subfolder.
        codeCounts: dict mapping each code to a dict with
            'threads' (set of distinct thread titles using the code),
            'posts' (int, number of posts with the code) and
            'posters' (set of distinct posters who used the code).
        project_title: used in the page header as
            "<project_title>: Coded Transcripts".

    Returns:
        None. The index is written to disk.
    """
    freqSortedCodes = sorted(codeCounts.items(),
                             key=lambda item: len(item[1]['threads']),
                             reverse=True)

    with open(outputdir + '/html/' + 'index.html', 'w') as outFile:
        header = "{}: Coded Transcripts".format(project_title)
        page = markup.page()
        page = genHeaderMenu(page, header)

        page.table(style="width: 100%")

        # Write codes header
        page.tr()
        page.td(class_="index-header")
        page.add('<h1>codes (n={})</h1>'.format(len(freqSortedCodes)))
        page.td.close()
        page.tr.close()

        # Write sorted list of codes with frequencies
        page.tr()
        page.td(class_="index-codes")
        for code, counts in freqSortedCodes:
            post_count = counts['posts']
            thread_count = len(counts['threads'])
            page.div(class_="index-code")
            page.a(code, href=urlSafe(code) + '.html')
            page.add(' &nbsp;&nbsp;(quotes={}, interviews={})'.format(
                post_count, thread_count))
            page.div.close()
        page.td.close()
        page.tr.close()

        # Total number of posts across all threads.
        num_posts = sum(len(thread.posts) for thread in threads)

        # Write threads header
        page.tr()
        page.td(class_="index-header")
        page.add('<h1>interviews (n={}, quotes={})</h1>'.format(
            len(threads), num_posts))
        page.td.close()
        page.tr.close()

        # Write sorted list of threads
        page.tr()
        page.td(class_="index-threads")
        for thread in sorted(threads, key=lambda t: t.title):
            page.a(thread.title, href=thread.outFileBase + '.html')
            page.br()
        page.td.close()
        page.tr.close()

        page.table.close()

        outFile.write(str(page))