Exemplo n.º 1
0
def genderize(args):
    """Bulk-genderize a CSV of first names via the Genderize.io API.

    Reads names from ``args.input`` (column chosen by a case-insensitive
    header match on "name"), queries the API in chunks of 10, and writes
    the predictions next to the original rows in a timestamped CSV derived
    from ``args.output``.  Attributes read from ``args`` (inferred from
    usage): ``input``, ``output``, ``key``, ``auto``, ``catch``,
    ``override``.  Exits the process on invalid paths or declined prompts.
    """
    print(args)

    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))

    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)

    ofilename, ofile_extension = os.path.splitext(args.output)

    # Timestamp in the output name so repeated runs never clobber each other.
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input

    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)

    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")

    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()

    # NOTE(review): for a bare relative filename os.path.dirname(ofile) is ""
    # and os.path.exists("") is False, so this exits even for a valid output
    # in the current directory -- confirm intended behaviour.
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()

    #Some set up stuff
    ##csv.field_size_limit(sys.maxsize)

    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(user_agent='GenderizeDocs/0.0', api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False

    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)
        headers = next(readCSV)  # take the first row

        # [Chen]: use re.search to find the specific column name, match string "name"
        # (case-insensitive; if no header matches, the index ends up one past
        # the last column and the row indexing below would raise IndexError)
        specific_header_index = 0
        for header in headers:
            if re.search("name", header, re.I) != None:
                break
            specific_header_index += 1

        # [Chen]: store rows without headers as a list
        rest = list(readCSV)
        # [Chen]: extract the first_name column
        first_name = [row[specific_header_index].strip() for row in rest]

        # NOTE(review): each l here is a name *string*, so the inner loop
        # iterates its characters -- o_first_name becomes a list of single
        # characters, not names.  The auto-complete pass at the bottom looks
        # these up in gender_dict keyed by full names and will not find them;
        # confirm whether first_name was meant to be a list of lists.
        o_first_name = list()
        for l in first_name:
            for b in l:
                o_first_name.append(b)

        if args.auto == True:
            # Deduplicate so each unique entry is queried only once.
            uniq_first_name = list(set(o_first_name))
            chunks = list(jpyh.splitlist(uniq_first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(uniq_first_name)) + " unique.")
        else:
            # Genderize.io accepts up to 10 names per request.
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) + " first_name")

        print("--- Processed into " + str(len(chunks)) + " chunks")

        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()

        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")

        if args.auto == True:
            # In auto mode the chunked results go to a temp file; the final
            # per-row file is written from gender_dict further below.
            ofile = ofile + ".tmp"

        response_time = []
        gender_responses = list()
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)

            # [Chen]: generate different headers by'-ORV' argument
            if args.override == True:
                headers.extend(["female", "male"])
            else:
                headers.extend(["gender", "probability", "count"])
            writer.writerow(headers)
            chunks_len = len(chunks)
            stopped = False

            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                # Retry loop: the unconditional `break` at the bottom means a
                # chunk is attempted once, except when the 502 prompt answers
                # yes and `continue` re-enters the loop.
                while not success:
                    try:
                        start = time.time()

                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)

                        gender_responses.append(dataset)
                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)

                        #Error handling
                        if "response not in JSON format" in str(
                                e) and args.catch == True:
                            if jpyh.query_yes_no(
                                    "\n---!! 502 detected, try again?"
                            ) == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(
                                e) and args.catch == True:
                            print(
                                "\n---!! Error, invalid API key! Check log file for details.\n"
                            )
                        else:
                            print(
                                "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                            )
                        stopped = True
                        break

                    response_time.append(time.time() - start)
                    print("Processed chunk " + str(index + 1) + " of " + str(chunks_len) + " -- Time remaining (est.): " + \
                        str( round( (sum(response_time) / len(response_time) * (chunks_len - index - 1)), 3)) + "s")

                    # [Chen]: Combine original data and the response: if go with '-OVR', convert gender to 0/1; if go without '-OVR', pick up gender, probability, count
                    if args.override == True:
                        for data in dataset:
                            append_response = []
                            if data.get('gender'):  # null check
                                female = 1 if data.get(
                                    'gender') == 'female' else 0
                                append_response = [female, female ^ 1
                                                   ]  # xor for 0 <=> 1
                            else:
                                append_response = [0, 0]
                            # Consume input rows in request order; assumes the
                            # API response order matches the chunk order.
                            writer.writerow([*rest[0], *append_response])
                            rest.pop(0)
                    else:
                        for data in dataset:
                            # values()[0] is the name itself; keep the rest
                            # (presumably gender, probability, count -- verify
                            # against the genderize client's dict ordering).
                            writer.writerow(
                                [*rest[0], *list(data.values())[1:]])
                            rest.pop(0)
                    break

            if args.auto == True:
                print("\nCompleting identical first_name...\n")
                #AUTOCOMPLETE first_name

                #Create master dict: name -> [gender, probability, count]
                gender_dict = dict()

                print(gender_dict)
                for response in gender_responses:
                    for d in response:
                        gender_dict[d.get("name")] = [
                            d.get("gender"),
                            d.get("probability"),
                            d.get("count")
                        ]

                # Final (non-.tmp) file: one row per original entry, filled
                # from the deduplicated lookups.
                filename, file_extension = os.path.splitext(ofile)
                with open(filename, 'w', newline='', encoding="utf8") as f:
                    writer = csv.writer(f)
                    writer.writerow(
                        list(["first_name", "gender", "probability", "count"]))

                    for name in o_first_name:
                        # NOTE(review): gender_dict.get may return None for a
                        # missing name, making data[0] raise TypeError.
                        data = gender_dict.get(name)
                        writer.writerow([name, data[0], data[1], data[2]])
            print("Done!\n")
Exemplo n.º 2
0
def genderize(args):
    """Bulk-genderize a CSV of (id, first name) rows via the Genderize.io API.

    Reads rows from ``args.input`` (name in column 1 when the file has two
    or more columns, else column 0), queries the API in chunks of 10, and
    either appends gender/probability/count columns or -- with
    ``args.override`` -- overwrites the last two columns with 0/1 flags in
    a timestamped output CSV.  Attributes read from ``args`` (inferred from
    usage): ``input``, ``output``, ``key``, ``noheader``, ``auto``,
    ``catch``, ``override``.  Exits the process on invalid paths or
    declined prompts.
    """
    print(args)

    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))

    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)

    ofilename, ofile_extension = os.path.splitext(args.output)

    # Timestamp in the output name so repeated runs never clobber each other.
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input

    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)

    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")

    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()

    # NOTE(review): os.path.dirname(ofile) is "" for a bare relative name and
    # os.path.exists("") is False, so this exits for outputs in the current
    # directory -- confirm intended behaviour.
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()

    #Some set up stuff
    # NOTE(review): on Windows this raises OverflowError (sys.maxsize exceeds
    # a C long) -- confirm portability is acceptable.
    csv.field_size_limit(sys.maxsize)

    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        # SECURITY NOTE(review): a hard-coded API key is used here even though
        # the user-supplied args.key was just printed -- args.key is ignored.
        # A credential committed to source should be revoked and removed.
        genderize = Genderize(user_agent='GenderizeDocs/0.0',
                              api_key='169f6a8e933dcec15a57235d7fde49d6')
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False

    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile,
                             delimiter=',',
                             quotechar='"',
                             skipinitialspace=True)
        line_count = 0  # counted below but not used afterwards
        first_name = []
        users_id = []
        input_fields = []
        rows = []
        input_fields = next(readCSV)  # header row
        for row in readCSV:  #Read CSV into first_name list
            rows.append(row)
            line_count += 1

        # Two or more columns: assume column 0 is an id and column 1 the
        # name; a single column holds the name directly.
        if len(input_fields) > 1:
            for row in rows:
                first_name.append(row[1])
                users_id.append(row[0])
        else:
            for row in rows:
                first_name.append(row[0])

        if args.noheader == False and first_name[0] in input_fields:
            first_name.pop(0)  #Remove header

        # NOTE(review): each l is a name *string*, so this flattens names into
        # single characters; o_first_name feeds the auto-complete output below
        # and its entries will not match gender_dict's full-name keys --
        # confirm intent.
        o_first_name = list()
        for l in first_name:
            for b in l:
                o_first_name.append(b)

        if args.auto == True:
            # uniq_first_name = list(set(o_first_name))
            # NOTE(review): deduplication is commented out, so the "unique"
            # count printed below is just len(first_name) again.
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(first_name)) + " unique.")
        else:
            # Genderize.io accepts up to 10 names per request.
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) + " first_name")

        print("--- Processed into " + str(len(chunks)) + " chunks")

        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()

        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")

        if args.auto == True:
            # Auto mode writes chunked results to a temp file; the final file
            # is produced from gender_dict further below.
            ofile = ofile + ".tmp"

        response_time = []
        gender_responses = list()
        output_fields = [*input_fields, "gender", "probability", "count"]
        if args.override == False:
            with open(ofile, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                writer.writerow(output_fields)
                chunks_len = len(chunks)
                stopped = False
                for index, chunk in enumerate(chunks):
                    if stopped:
                        break
                    success = False
                    # Retry loop: the unconditional `break` at the bottom means
                    # each chunk is attempted once unless the 502 prompt's
                    # `continue` re-enters it.
                    while not success:
                        try:
                            start = time.time()

                            if key_present:
                                dataset = genderize.get(chunk)

                            else:
                                # SECURITY NOTE(review): hard-coded API key
                                # (see note at the top of the function).
                                dataset = Genderize(
                                    user_agent='GenderizeDocs/0.0',
                                    api_key='169f6a8e933dcec15a57235d7fde49d6'
                                ).get(chunk)

                            gender_responses.append(dataset)
                            success = True
                        except GenderizeException as e:
                            print("\n" + str(e))
                            logger.error(e)

                            #Error handling
                            if "response not in JSON format" in str(
                                    e) and args.catch == True:
                                if jpyh.query_yes_no(
                                        "\n---!! 502 detected, try again?"
                                ) == True:
                                    success = False
                                    continue
                            elif "Invalid API key" in str(
                                    e) and args.catch == True:
                                print(
                                    "\n---!! Error, invalid API key! Check log file for details.\n"
                                )
                            else:
                                print(
                                    "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                                )
                            stopped = True
                            break

                        response_time.append(time.time() - start)
                        print("Processed chunk " + str(index + 1) + " of " + str(chunks_len) + " -- Time remaining (est.): " + \
                            str( round( (sum(response_time) / len(response_time) * (chunks_len - index - 1)), 3)) + "s")

                        break
                j = 0
                response_arr = list()
                # Flatten the chunked responses into one list, then join to the
                # input rows positionally (assumes API order == request order).
                for data in gender_responses:
                    for el in data:
                        response_arr.append(el)

                # NOTE(review): if any chunk failed (stopped == True),
                # response_arr is shorter than rows and this raises IndexError.
                for row in rows:
                    row.append(response_arr[j]['gender'])
                    row.append(response_arr[j]['probability'])
                    row.append(response_arr[j]['count'])
                    j += 1

                for row in rows:
                    writer.writerow(row)

        if args.auto == True:
            print("\nCompleting identical first_name...\n")
            #AUTOCOMPLETE first_name

            #Create master dict: name -> [count, gender, probability]
            gender_dict = dict()
            for response in gender_responses:
                for d in response:
                    gender_dict[d.get("name")] = [
                        d.get("count"),
                        d.get("gender"),
                        d.get("probability")
                    ]

            filename, file_extension = os.path.splitext(ofile)
            # NOTE(review): '-' as the CSV delimiter is unusual -- confirm.
            with open(filename, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f, delimiter='-')
                writer.writerow(
                    list(["name", "count", "gender", "probability"]))

                for name in o_first_name:
                    data = gender_dict.get(name)
                    # NOTE(review): data is [count, gender, probability], so
                    # data[1] (gender) is written twice and count is never
                    # written, despite the header above; data may also be None
                    # for a missing name -- confirm.
                    writer.writerow([name, data[1], data[1], data[2]])

        if args.override == True:
            output_fields = [*input_fields]
            with open(ofile, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                writer.writerow(output_fields)
                chunks_len = len(chunks)
                stopped = False
                for index, chunk in enumerate(chunks):
                    if stopped:
                        break
                    success = False
                    # Same single-attempt retry structure as above.
                    while not success:
                        try:
                            start = time.time()

                            if key_present:
                                dataset = genderize.get(chunk)
                            else:
                                # SECURITY NOTE(review): hard-coded API key
                                # (see note at the top of the function).
                                dataset = Genderize(
                                    user_agent='GenderizeDocs/0.0',
                                    api_key='169f6a8e933dcec15a57235d7fde49d6'
                                ).get(chunk)

                            gender_responses.append(dataset)
                            success = True
                        except GenderizeException as e:
                            print("\n" + str(e))
                            logger.error(e)

                            #Error handling
                            if "response not in JSON format" in str(
                                    e) and args.catch == True:
                                if jpyh.query_yes_no(
                                        "\n---!! 502 detected, try again?"
                                ) == True:
                                    success = False
                                    continue
                            elif "Invalid API key" in str(
                                    e) and args.catch == True:
                                print(
                                    "\n---!! Error, invalid API key! Check log file for details.\n"
                                )
                            else:
                                print(
                                    "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                                )
                            stopped = True
                            break

                        response_time.append(time.time() - start)
                        print("Processed chunk " + str(index + 1) + " of " + str(chunks_len) + " -- Time remaining (est.): " + \
                            str( round( (sum(response_time) / len(response_time) * (chunks_len - index - 1)), 3)) + "s")

                        break

                i = 0
                response_arr = list()
                for data in gender_responses:
                    for el in data:
                        response_arr.append(el)

                index = len(rows)  # NOTE(review): assigned but never used
                # Overwrite the last two columns of each row in place with
                # male/female 0-1 flags; this assumes the input already carries
                # two trailing placeholder columns (destructive otherwise).
                for row in rows:
                    # print(rows[-1][-2])
                    if response_arr[i]['gender'] == 'male':
                        rows[i][-1] = "1"
                        rows[i][-2] = "0"
                    else:
                        rows[i][-1] = "0"
                        rows[i][-2] = "1"
                    i += 1

                for row in rows:
                    writer.writerow(row)
            print("Done!\n")
Exemplo n.º 3
0
def genderize(args):
    """Genderize a CSV of names via the Genderize.io API, then offer an
    interactive name lookup.

    Reads rows from ``args.input`` (name in column 1 when a row has two or
    more columns, else column 0), queries the API in chunks of 10, appends
    gender/probability/count columns to each input row in ``args.output``,
    and optionally writes auto-deduplicated and male/female-flag variants.
    Finishes with an interactive prompt loop for looking up single names.
    Attributes read from ``args`` (inferred from usage): ``input``,
    ``output``, ``key``, ``noheader``, ``auto``, ``catch``, ``override``.
    Exits the process on invalid paths or declined prompts.
    """
    print(args)

    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))

    logging.basicConfig(filename=dir_path + os.sep + "log.txt", level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger=logging.getLogger(__name__)

    ofilename, ofile_extension = os.path.splitext(args.output)

    #ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    # Output keeps the original name (no timestamp), so reruns overwrite it.
    ofile = ofilename + ofile_extension
    ifile = args.input

    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)

    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")

    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()

    #if not os.path.exists(os.path.dirname(ofile)):
    # NOTE(review): unlike the commented-out dirname check, this requires the
    # output *file itself* to already exist and exits otherwise -- confirm
    # this inversion is intended.
    if not os.path.exists(ofile):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()

    #Some set up stuff
    ##csv.field_size_limit(sys.maxsize)

    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(
            user_agent='GenderizeDocs/0.0',
            api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False

    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)
        first_name = []
        inputData = []

        # Name column: column 1 for multi-column rows, column 0 otherwise.
        for row in readCSV: #Read CSV into first_name list
            if len(row) > 1:
                row[1] = row[1].strip()
                first_name.append(row[1])
            else:
                row[0] = row[0].strip()
                first_name.append(row[0])
            inputData.append(row)

        headers = inputData[0]
        inputData = inputData[1:]
        # Iterator over data rows, consumed in lock-step with API responses
        # in the chunk loop below.
        data_iterator = iter(inputData)

        if args.noheader == False:
            first_name.pop(0) #Remove header

        # NOTE(review): each l is a name *string*, so this flattens names into
        # single characters; o_first_name is never read again in this function
        # -- apparently dead code.
        o_first_name = list()
        for l in first_name:
            for b in l:
                o_first_name.append(b)

        if args.auto == True:
            # Deduplicate so each unique name is queried only once.
            uniq_first_name = list(set(first_name))
            chunks = list(jpyh.splitlist(uniq_first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) + " first_name. " + str(len(uniq_first_name)) + " unique.")
        else:
            # Genderize.io accepts up to 10 names per request.
            chunks = list(jpyh.splitlist(first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) + " first_name")

        print("--- Processed into " + str(len(chunks)) + " chunks")

        if jpyh.query_yes_no("\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()

        if os.path.isfile(ofile):
            if jpyh.query_yes_no("---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")

        if args.auto == True:
            ofile = ofile + ".tmp"

        # Append result columns to the header if absent.  NOTE(review): each
        # *_index is only bound inside its own `if`, so when the input already
        # contains e.g. a "gender" column, gender_index is never assigned and
        # the override block below would raise NameError -- confirm.
        if "gender" not in headers:
            headers.append("gender")
            gender_index = headers.index("gender")
        if "probability" not in headers:
            headers.append("probability")
            prob_index = headers.index("probability")
        if "count" not in headers:
            headers.append("count")
            count_index = headers.index("count")

        response_time = [];
        gender_responses = list()
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            #writer.writerow(list(["first_name", "gender", "probability", "count"]))
            chunks_len = len(chunks)
            stopped = False

            #print(inputData)

            writer.writerow(headers)

            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                # Retry loop: the unconditional `break` at the bottom means a
                # chunk is attempted once, except when the 502 prompt answers
                # yes and `continue` re-enters the loop.
                while not success:
                    try:
                        start = time.time()

                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)

                        gender_responses.append(dataset)
                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)

                        #Error handling
                        if "response not in JSON format" in str(e) and args.catch == True:
                            if jpyh.query_yes_no("\n---!! 502 detected, try again?") == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(e) and args.catch == True:
                            print("\n---!! Error, invalid API key! Check log file for details.\n")
                        else:
                            print("\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n")
                        stopped = True
                        break

                    response_time.append(time.time() - start)
                    print("Processed chunk " + str(index + 1) + " of " + str(chunks_len) + " -- Time remaining (est.): " + \
                        str( round( (sum(response_time) / len(response_time) * (chunks_len - index - 1)), 3)) + "s")

                    #print(dataset)

                    # Pair each API result with the next input row.
                    # NOTE(review): inputData.index(next_row) re-finds the row
                    # already in hand (first match wins on duplicates), and
                    # `entry_row += entry` mutates the row inside inputData --
                    # the auto/override passes below see the extended rows.
                    for data in dataset:
                        next_row = next(data_iterator)
                        next_row_index = inputData.index(next_row)
                        entry_row = inputData[next_row_index]

                        entry = [data["gender"], data["probability"], data["count"]]
                        entry_row += entry
                        writer.writerow(entry_row)
                    break

            if args.auto == True:
                print("\nCompleting identical first_name...\n")
                #AUTOCOMPLETE first_name

                #Create master dict: name -> [gender, probability, count]
                gender_dict = dict()
                for response in gender_responses:
                    for d in response:
                        gender_dict[d.get("name")] = [d.get("gender"), d.get("probability"), d.get("count")]

                #names seen
                seen_names = []

                # Write only the first occurrence of each name; the name column
                # is picked by row width (column 1 for wide rows, else column 0).
                with open(ofilename + "_auto" + ofile_extension, 'w', newline='', encoding="utf8") as f:
                    writer = csv.writer(f)
                    writer.writerow(headers)

                    for row in inputData:
                        if len(row) > 4:
                            name = row[1]
                            if name not in seen_names:
                                writer.writerow(row)
                            seen_names.append(name)
                        else:
                            name = row[0]
                            if name not in seen_names:
                                writer.writerow(row)
                            seen_names.append(name)

            if args.override == True:
                print("\nExercising override \n")

                # Re-emit rows with one-hot female/male flag columns derived
                # from the gender column written earlier.
                with open(ofilename + "_override" + ofile_extension, 'w', newline='', encoding="utf8") as f:
                    writer = csv.writer(f)

                    #add headers
                    headers += ["female", "male"]
                    writer.writerow(headers)

                    for row in inputData:
                        gender = row[gender_index]
                        if gender == "male":
                            row += [0, 1]
                            writer.writerow(row)
                        else:
                            row += [1, 0]
                            writer.writerow(row)

            # Lookup table for the interactive search below.
            master_dict = dict()
            for response in gender_responses:
                for line in response:
                   master_dict[line.get("name")] = [line.get("gender"), line.get("probability"), line.get("count")]

            # Interactive search loop; declining the first prompt exits the
            # whole process via sys.exit().
            userResponse = True
            while userResponse == True:
                if jpyh.query_yes_no("Would you like to search the output file for the gender of a name") == False:
                    userResponse = False
                    sys.exit()

                sys.stdout.write("Enter a name you would like to search. \n")
                user_input = input()

                if user_input == '':
                    sys.stdout.write("You MUST enter a name \n")

                elif user_input not in master_dict:
                    sys.stdout.write("The name you entered is not in the output file \n")

                    if jpyh.query_yes_no("Would you like to genderize {}?".format(user_input)) == False:
                        userResponse = False

                    else:
                        # One-off API call for a name not in this run's output.
                        if key_present:
                            api_response = genderize.get([user_input])
                        else:
                            api_response = Genderize().get([user_input])
                        for line in api_response:
                            name, gender, prob, count = line.get("name"), line.get("gender"), line.get("probability"), line.get("count")
                            print("Name: {}, Gender: {}, Probability: {}, Count: {}".format(name, gender, prob, count))

                else:
                    response = master_dict[user_input]
                    sys.stdout.write("Name: {}, Gender: {}, Probability: {}, Count: {}".format(user_input, response[0], response[1], response[2]))
                    if jpyh.query_yes_no("\nWould you like to search for another name?") == False:
                        userResponse = False

            print("Done!\n")
Exemplo n.º 4
0
def genderize(args):
    """Genderize names from a CSV file using the Genderize.io API.

    Behaviour is driven by the parsed command-line ``args``:
      * ``args.input`` / ``args.output`` -- CSV paths (the output name gets
        a timestamp suffix so runs do not clobber each other),
      * ``args.key``      -- API key, or the sentinel "NO_API",
      * ``args.override`` -- name of the column holding first names, or the
        sentinel "NO_OVERRIDE" to treat every cell as a name,
      * ``args.noheader`` -- True when the input CSV has no header row,
      * ``args.auto``     -- de-duplicate names before querying, then expand
        the results into a final output file,
      * ``args.catch``    -- enable interactive retry on specific API errors.

    Exits via sys.exit() on a missing input file, an invalid output path,
    or user cancellation.
    """
    print(args)

    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))

    logging.basicConfig(filename=dir_path + os.sep + "log.txt", level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger=logging.getLogger(__name__)

    ofilename, ofile_extension = os.path.splitext(args.output)

    # Timestamped output name prevents overwriting earlier runs.
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input

    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)

    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")

    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()

    # NOTE(review): for a bare filename os.path.dirname(ofile) is "" and
    # os.path.exists("") is False, so an output name without a directory
    # component always exits here -- confirm args.output is expected to
    # carry a path component.
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()

    #Some set up stuff
    ##csv.field_size_limit(sys.maxsize)

    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(
            user_agent='GenderizeDocs/0.0',
            api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False

    # Modify this section to take into account what the user wants to use through the command line
    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)
        first_name = []            # names to send (or whole rows until flattened)
        raw = []                   # full original rows, kept for override output
        original_headers = [];     # header row saved when overriding
        is_override = False
        column_number = -1         # index of the override column; -1 = not found yet
        # we are attempting to override the column that the names are stored in

        # Easier to check a boolean than to constantly check if args.override is equal to 'NO_OVERRIDE'
        if args.override != 'NO_OVERRIDE':
            is_override = True

        for row in readCSV: #Read CSV into first_name list
            # if we are overriding the search column
            if is_override:
                # ugly nested mess but it works.

                # if we have not found the list position of the desired override column
                if column_number == -1:
                    # get the first row from the reader (assumed to be the first row)
                    first_name.append(row)
                    # also save this to the raw list for later use
                    raw.append(row)
                    # iterate through each item in the row we just saved and keep track of the for loop index
                    for index, column in enumerate(first_name[0]):
                        # if our column name is equal to the override name, we found the index number we need to proceed. Break from the loop
                        if column == args.override:
                            column_number = index
                            break
                        # error detection if the user override is not found in the header of the input csv.
                        if index == len(first_name[0])-1:
                            print("User Override '" + args.override + "' not found in input CSV file, Exiting...")
                            sys.exit()
                # Our column number should be found by now, so continue to import the specific data that we want.
                else:
                    # IMPORTANT: we need to remove all leading and trailing whitespaces to ensure that the genderizer responds with correct information
                    stripped = row[column_number].strip()
                    # append our stripped string onto the first_name list
                    first_name.append(stripped)
                    # save the entire row to the raw list
                    raw.append(row)



            # if no override, continue like normal
            else:
                first_name.append(row)


        # if we have a header, we need to remove it so it is not included in the submission
        if args.noheader == False:
            if is_override:
                    # Before we pop the first list item in first_name, save it to be our original headers so we can write them later
                    original_headers = first_name[0]
                    # We also need to pop the for item in the raw list or we will end up with extra data
                    raw.pop(0)
            first_name.pop(0) #Remove header


        o_first_name = list()
        # We dont need to strip the first name list if we are overriding because it has already been taken care of
        if is_override:
            o_first_name = first_name


        # Removes the [''] on each list item so we just end up with names when iterating through the list
        else:
            for l in first_name:
                for b in l:
                    o_first_name.append(b)

        # moved uniq_first_name outside of the if statement for later use.
        uniq_first_name = []

        if args.auto == True:
            # dict.fromkeys preserves order while dropping duplicate names.
            uniq_first_name = list(dict.fromkeys(o_first_name))
            chunks = list(jpyh.splitlist(uniq_first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) + " first_name. " + str(len(uniq_first_name)) + " unique.")
        else:
            # splitting the name list into chunks of 10 due to api restrictions
            # NOTE(review): in non-override mode first_name holds whole rows
            # (lists), not strings -- confirm jpyh.splitlist / Genderize.get
            # handle this; auto mode uses the flattened o_first_name instead.
            chunks = list(jpyh.splitlist(first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) + " first_name")

        print("--- Processed into " + str(len(chunks)) + " chunks")


        if jpyh.query_yes_no("\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()

        if os.path.isfile(ofile):
            if jpyh.query_yes_no("---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")

        if args.auto == True:
            # Auto mode writes de-duplicated results to a .tmp file first,
            # then expands them into the real output file further below.
            ofile = ofile + ".tmp"

        response_time = [];
        gender_responses = list()     # one API response list per chunk
        with open(ofile, 'w', newline='', encoding="utf8") as f:

            writer = csv.writer(f)
            ## TODO Add new system to write all rows of the original file. Done
            # If we are overriding, we need to write different headers into the output csv file. We call the write_headers function to keep the code clean
            if is_override:
                write_headers(writer, original_headers)
            # else, continue as expected
            else:
                writer.writerow(list(["first_name","gender", "probability", "count"]))
            chunks_len = len(chunks)
            stopped = False
            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                while not success:
                    try:
                        start = time.time()

                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)

                        gender_responses.append(dataset)

                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)

                        #Error handling
                        if "response not in JSON format" in str(e) and args.catch == True:
                            if jpyh.query_yes_no("\n---!! 502 detected, try again?") == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(e) and args.catch == True:
                            print("\n---!! Error, invalid API key! Check log file for details.\n")
                        else:
                            print("\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n")
                        stopped = True
                        break

                    # Rolling mean of chunk latencies drives the ETA estimate.
                    response_time.append(time.time() - start)
                    print("Processed chunk " + str(index + 1) + " of " + str(chunks_len) + " -- Time remaining (est.): " + \
                        str( round( (sum(response_time) / len(response_time) * (chunks_len - index - 1)), 3)) + "s")

            gender_dict = dict()

            # Moved this function out of the autocomplete function to allow us to use it for the non-autocomplete writing as well
            for response in gender_responses:
                    for d in response:
                        gender_dict[d.get("name")] = [d.get("gender"), d.get("probability"), d.get("count")]

            # we need to iterate over all of our "cleaned" first names
            # NOTE(review): if the API loop stopped early, gender_dict lacks
            # some names and gender_dict.get(name) returns None, so data[0]
            # below raises TypeError -- consider guarding.
            for index, name in enumerate(o_first_name):
                data = gender_dict.get(name)
                # If we are overriding, we need to print our raw data plus our genderize information.
                if is_override:
                    data_list = [data[0], data[1], data[2]]
                    writer.writerow(raw[index] + data_list)
                # If we are not overriding, we print the standard information
                else:
                    writer.writerow([name, data[0], data[1], data[2]])

            # if we have the autocomplete enabled, we need to allow overriding in this mode as well.
            if args.auto == True:
                print("\nCompleting identical first_name...\n")

                # Strip the ".tmp" suffix added above to get the real output path.
                filename, file_extension = os.path.splitext(ofile)
                with open(filename, 'w', newline='', encoding="utf8") as f:
                    writer = csv.writer(f)
                    # Before we enter the for loop, we need to print the correct headers into the output csv file.
                    # If we are overriding, we need to print out saved original headers as well as the new headers. We call our write_headers function to keep the code clean
                    if is_override:
                        write_headers(writer, original_headers)
                        # we need to remove duplicate items in our raw file for proper file writing.
                        raw_cleaned = remove_dupes(raw, column_number)
                    # If we are not overriding, we can print the standard headers.
                    else:
                        writer.writerow(list(["first_name","gender", "probability", "count"]))
                    # We need to iterate over our uniq_first_name list inorder to write the correct names
                    for index, name in enumerate(uniq_first_name):
                        # If we are overriding, we need to combine the data recieved from the genderize api and combine it with our clean raw list inorder to write the correct information
                        if is_override:
                            data = gender_dict.get(name)
                            data_list = [data[0], data[1], data[2]]
                            writer.writerow(raw_cleaned[index] + data_list)
                        # If we are not overriding, we can perform everything as expected.
                        else:
                            data = gender_dict.get(name)
                            writer.writerow([name, data[0], data[1], data[2]])



            print("Done!\n")
Exemplo n.º 5
0
def genderize():
    """Genderize a CSV of names given on the command line.

    Usage: python genderize.py [input file path] [output file path]

    Reads names from sys.argv[1] (first row skipped as a header), queries
    the Genderize.io API in chunks of 10, and writes
    name/gender/probability/count rows to a timestamped CSV derived from
    sys.argv[2]. Exits via sys.exit() on bad arguments, user cancellation,
    or an API error.
    """
    if len(sys.argv) != 3:
        # Fixed typo in the usage message ("Plrease" -> "Please").
        print(
            "Please specify input and output files: python genderize.py [input file path] [output file path]"
        )
        sys.exit()

    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    filename, file_extension = os.path.splitext(sys.argv[2])
    # Timestamped output name prevents overwriting earlier runs.
    ofile = filename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    print("\n--- Input file: " + dir_path + os.sep + sys.argv[1])
    print("--- Output file: " + dir_path + os.sep + ofile + "\n")

    # Allow very large CSV fields. csv.field_size_limit(sys.maxsize) raises
    # OverflowError where the platform C long is 32-bit (e.g. Windows), so
    # back off until a value is accepted.
    max_field = sys.maxsize
    while True:
        try:
            csv.field_size_limit(max_field)
            break
        except OverflowError:
            max_field = int(max_field / 10)

    #Initialize API key
    genderize = Genderize(user_agent='GenderizeDocs/0.0', api_key=API_KEY)

    with open(sys.argv[1], encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        # NOTE(review): each element of names is a whole CSV row (a list),
        # not a string; presumably the input is a single-column CSV. Other
        # variants of this function flatten the rows first -- confirm.
        names = []
        for row in readCSV:
            names.append(row)

        names.pop(0)  # drop the header row

        print("--- Read CSV with " + str(len(names)) + " names")

        # Genderize.io accepts at most 10 names per request.
        chunks = list(jpyh.splitlist(names, 10))

        print("--- Processed into " + str(len(chunks)) + " chunks")

        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()

        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")

        response_time = []
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            writer.writerow(list(["names", "gender", "probability", "count"]))
            chunks_len = len(chunks)
            for index, chunk in enumerate(chunks):
                start = time.time()
                try:
                    dataset = genderize.get(chunk)
                except GenderizeException as e:
                    print(e)
                    print(
                        "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key\n"
                    )
                    sys.exit()

                # Rolling mean of chunk latencies drives the ETA estimate.
                response_time.append(time.time() - start)
                print("Processed chunk " + str(index + 1) + " of " + str(chunks_len) + " -- Time remaining (est.): " + \
                    str( round( (sum(response_time) / len(response_time) * (chunks_len - index - 1)), 3)) + "s")

                for data in dataset:
                    # dicts preserve insertion order, so values() emits
                    # name, gender, probability, count in header order.
                    writer.writerow(data.values())
Exemplo n.º 6
0
def genderize(args):
    """Genderize names from a CSV (name column hard-coded at index 1) and
    merge the API results back into the original rows.

    Unlike the other variants, the input header row is extended in place:
    with binary 'male'/'female' columns when ``args.auto`` is False, or
    with 'gender'/'probability'/'count' columns when it is True, and each
    output row is the original row joined with its API result.

    Exits via sys.exit() on a missing input file, an invalid output path,
    or user cancellation.
    """
    print(args)

    # File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))

    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)

    ofilename, ofile_extension = os.path.splitext(args.output)

    # Timestamped output name prevents overwriting earlier runs.
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input

    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)

    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")

    # File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()

    # NOTE(review): for a bare filename os.path.dirname(ofile) is "" and
    # os.path.exists("") is False, so an output name without a directory
    # component always exits here -- confirm this is intended.
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()

    # Some set up stuff
    # csv.field_size_limit(sys.maxsize)

    # Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(user_agent='GenderizeDocs/0.0', api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False

    # Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)

        i = next(readCSV)  # set i to the headers as an array
        first_name = []    # name column values, in file order
        allData = []       # every data row, kept for the merged output

        if args.auto == False:  # add columns based off auto
            i.append('male')
            i.append('female')
        if args.auto == True:
            i.append('gender')
            i.append('probability')
            i.append('count')

        for row in readCSV:  # Read CSV into first_name list
            allData.append(row)
            first_name.append(row[1])  # specify the name column index here

        # NOTE(review): the header row was already consumed by next(readCSV)
        # above, so this pop removes a real data row when noheader is False
        # -- confirm whether the double skip is intended.
        if args.noheader == False:
            first_name.pop(0)  # Remove header

        o_first_name = list()

        # Deleted this part of the code because it substrings the name column
        # for l in first_name:
        #     for b in l:
        #         o_first_name.append(b)
        # print(o_first_name)

        if args.auto == True:

            chunks = list(jpyh.splitlist(first_name, 10))

            # NOTE(review): no de-duplication happens here, so the "unique"
            # count printed below is just len(first_name) again.
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(first_name)) + " unique.")
        else:
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) + " first_name")

        print("--- Processed into " + str(len(chunks)) + " chunks")

        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()

        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")

        if args.auto == True:
            # Auto mode writes to a .tmp file first, then rewrites the real
            # output file further below.
            ofile = ofile + ".tmp"

        response_time = []
        gender_responses = list()  # one API response list per chunk
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            writer.writerow(i)  # write row header based off input file header
            chunks_len = len(chunks)
            stopped = False
            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                while not success:
                    try:
                        start = time.time()

                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)

                        gender_responses.append(dataset)
                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)

                        # zzError handling
                        if "response not in JSON format" in str(
                                e) and args.catch == True:
                            if jpyh.query_yes_no(
                                    "\n---!! 502 detected, try again?"
                            ) == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(
                                e) and args.catch == True:
                            print(
                                "\n---!! Error, invalid API key! Check log file for details.\n"
                            )
                        else:
                            print(
                                "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                            )
                        stopped = True
                        break

                    # Rolling mean of chunk latencies drives the ETA estimate.
                    response_time.append(time.time() - start)
                    print("Processed chunk " + str(index + 1) + " of " +
                          str(chunks_len) + " -- Time remaining (est.): " +
                          str(
                              round((sum(response_time) / len(response_time) *
                                     (chunks_len - index - 1)), 3)) + "s")

                    # Per-chunk merge state; rebuilt on every chunk. The
                    # trailing `break` below exits the retry loop after one
                    # write pass, so all rows are rescanned once per chunk
                    # against that chunk's dataset only.
                    singleDataDict1 = dict()
                    newSingleDataDict = dict()
                    singleDataDictGen = dict()
                    singleDataDictData = dict()
                    newDataListWithGen = list()
                    allDataList = list()
                    addDataDict = {}
                    newDataList = list()
                    newArry = list()

                    # get each row and make a new dictionatry
                    # NOTE(review): singleDataDict1 is reused across rows, so
                    # a short row inherits leftover keys from a longer earlier
                    # row -- confirm all rows have equal length.
                    for l in allData:
                        for n in range(0, len(l)):
                            singleDataDict1[n] = l[n]

                        dictionary_copy = singleDataDict1.copy()
                        allDataList.append(dictionary_copy)

                    # Combined the input file data with the gender api data
                    for m in allDataList:
                        for n in dataset:

                            if n['name'] == m[1]:  # specify name column here
                                singleDataDictGen = n
                                singleDataDictData = m
                                singleDataDictData.update(singleDataDictGen)

                                # delete the name key because of duplication
                                del singleDataDictData['name']

                                newDataListWithGen.append(singleDataDictData)

                    for newData in newDataListWithGen:
                        # add gender binary at the end of the data dictionary
                        # NOTE(review): the API returns gender=None for
                        # unknown names; None > 0.5 raises TypeError here,
                        # and rows that match neither branch get no
                        # male/female columns -- consider guarding.
                        if newData['gender'] == 'female' and newData[
                                'probability'] > 0.5:
                            newData.update({'male': 0, 'female': 1})
                        elif newData['gender'] == 'male' and newData[
                                'probability'] > 0.5:
                            newData.update({'male': 1, 'female': 0})
                        # since this is without argument auto delete gender, probability, and count
                        del newData['gender']
                        del newData['probability']
                        del newData['count']
                        # write the data column
                        writer.writerow(newData.values())
                    break

            if args.auto == True:
                print("\nCompleting identical first_name...\n")
                # AUTOCOMPLETE first_name

                # Create master dict
                gender_dict = dict()
                singleDataDict = dict()
                singleDataDict1 = dict()
                newSingleDataDict = dict()
                singleDataDictGen = dict()
                singleDataDictData = dict()
                newDataListWithGen = list()
                allDataList = list()
                addDataDict = {}
                newDataList = list()
                newArry = list()

                # get each row and make a new dictionatry
                for l in allData:
                    for n in range(0, len(l)):

                        singleDataDict1[n] = l[n]

                    dictionary_copy = singleDataDict1.copy()
                    allDataList.append(dictionary_copy)

                # Combined the input file data rows with the gender api data rows
                # NOTE(review): `dataset` here is only the LAST chunk's
                # response (gender_responses holds the rest), so names from
                # earlier chunks never match in auto mode -- likely a bug.
                for m in allDataList:
                    for n in dataset:

                        if n['name'] == m[1]:  # specify name column here
                            singleDataDictGen = n
                            singleDataDictData = m
                            singleDataDictData.update(singleDataDictGen)

                            del singleDataDictData['name']

                            newDataListWithGen.append(singleDataDictData)

                # Strip the ".tmp" suffix to get the real output path.
                filename, file_extension = os.path.splitext(ofile)
                with open(filename, 'w', newline='', encoding="utf8") as f:
                    writer = csv.writer(f)

                    writer.writerow(i)  # write the header column

                    for newData in newDataListWithGen:
                        # write the data column
                        writer.writerow(newData.values())

            print("Done!\n")