import os

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from tqdm import tqdm

# readHuge, checkPrint and gridSearch, along with the module-level
# constants (path, outPath, userBase, dropcols), are assumed to be
# defined elsewhere in this project.


def smallFileReport(sizes):
    os.chdir(outPath)
    filenames = ["{}{}small.csv".format(userBase, ii) for ii in sizes]
    for ff in filenames:
        df = readHuge(ff)  #.drop(dropcols,axis=1).set_index("id")
        print(ff)
        print(df.shape)
        print(df.groupby("userid")["clean_text"].count())
def selectUsernames(sizes, numUsers):
    os.chdir(outPath)
    filenames = ["{}{}.csv".format(userBase, ii) for ii in sizes]
    maxFile = readHuge(filenames[-1])

    # Users in the largest file are guaranteed to appear in every
    # smaller file as well, so sampling here is sufficient;
    # drop_duplicates avoids picking the same userid twice.
    usernames = maxFile["userid"].drop_duplicates().sample(numUsers)
    out = os.path.join(outPath, "userSample{}.csv".format(numUsers))
    # Keep the header so makeSmallFiles can read the column back by name.
    usernames.to_csv(out, header=True)
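A minimal usage sketch; the sizes list and user count below are illustrative, assuming buildUserFiles has already produced the corresponding user files:

sizes = [25, 50, 75, 100]
selectUsernames(sizes, numUsers=50)  # writes userSample50.csv to outPath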
def chk():
    os.chdir(outPath)
    output = []
    for file in os.listdir(outPath):
        if not file.endswith(".csv"):
            continue  # skip report.txt and any other non-CSV artifacts
        df = readHuge(file)
        output.append((file, df.shape))

    for out in output:
        print("{}: {}".format(*out))
Example #4
def baseDF(file, out=None):
    filepath = os.path.join(path, file)
    df = readHuge(filepath).set_index("id")
    try:
        df = df.drop(dropcols, axis=1)
    except KeyError:
        # Columns were already dropped or never present; continue as-is.
        pass
    checkPrint(df.shape, out)
    checkPrint(df.groupby("userid")["clean_text"].count(), out)
    return df
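checkPrint is not shown in this listing; a minimal sketch consistent with how baseDF calls it, treating out as an optional open file handle (a hypothetical stand-in, not the original helper):

def checkPrint(value, out=None):
    # Hypothetical helper: echo to stdout and, if a handle is
    # supplied, append the same line to the report file.
    print(value)
    if out is not None:
        out.write("{}\n".format(value))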
def buildUserFiles(mainFile, cut=0.5):
    '''
    Assemble the files for testing
    UserX = X tweets per user
    '''
    report = open("report.txt", 'w')
    file = readHuge(mainFile)  #.drop(dropcols,axis=1)
    index = file["userid"].unique().tolist()
    cutoff = int(len(index) * cut)

    report.write("Base: {} users with {} Tweets total\n".format(
        len(index), file.shape[0]))

    cnt = 0
    safe = True
    while cnt < 1000 and safe:
        cnt += 25
        dfs = []
        hold = []
        dropids = []
        dropuser = []

        for uid in tqdm(index):
            tweets = file[file["userid"] == uid]
            try:
                picked = tweets.sample(cnt)
                dfs.append(picked)
            except ValueError:
                # sample() raises ValueError when a user has fewer than
                # cnt tweets; log it and drop the user from future rounds.
                hold.append("\tUser {} did not have {} Tweets\n".format(
                    uid, cnt))
                dropids.extend(tweets.index.tolist())
                dropuser.append(uid)

        if not dfs:  # every remaining user fell short of cnt Tweets
            break
        result = pd.concat(dfs)
        outfile = "{}{}.csv".format(userBase, cnt)
        out = os.path.join(outPath, outfile)
        report.write("{} has {} users and {} Tweets\n".format(
            outfile, len(result["userid"].unique()), result.shape[0]))
        for hh in hold:
            report.write(hh)

        file.drop(dropids, inplace=True)
        for dd in dropuser:
            index.remove(dd)

        if len(index) < cutoff:  #Less than half the users now
            safe = False

        report.write("\n")
        result.to_csv(out)
        print("{} created".format(outfile))
    report.close()
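A hedged usage sketch; the source filename is illustrative. With cut=0.5, the loop stops once fewer than half of the original users can still supply the requested tweet count:

buildUserFiles("all_tweets.csv", cut=0.5)
# Writes user25.csv, user50.csv, ... to outPath and report.txt
# to the current working directory.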
def makeSmallFiles(sizes, numUsers, sampleSize=None):
    if sampleSize is None:
        sampleSize = numUsers

    os.chdir(outPath)
    nameFile = "userSample{}.csv".format(numUsers)
    names = pd.read_csv(nameFile, header=0,
                        index_col=0)["userid"].sample(sampleSize)
    names = names.tolist()

    filenames = ["{}{}.csv".format(userBase, ii) for ii in sizes]
    for ff in filenames:
        df = readHuge(ff).drop(dropcols, axis=1).set_index("id")
        sample = df.loc[df["userid"].isin(names), :]
        outname = ff.replace(".csv", "small.csv")
        sample.to_csv(outname)
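Putting the pieces together, a sketch of the intended order of operations (sizes and user counts are illustrative):

sizes = [25, 50, 75, 100]
buildUserFiles("all_tweets.csv")     # user{N}.csv files
selectUsernames(sizes, numUsers=50)  # userSample50.csv
makeSmallFiles(sizes, numUsers=50)   # user{N}small.csv files
smallFileReport(sizes)               # sanity-check the small files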
Example #7
def main(file, one, two, threes, params):
    filepath = os.path.join(path, file)
    df = readHuge(filepath).set_index("id").drop(dropcols, axis=1)

    outputs = []
    for idx, cv in enumerate(threes):
        #print(cv.get_params())

        pipe = Pipeline([("vect", one), ("tfidf", two), ("clf", cv)],
                        verbose=True)

        parameters = {**params[0], **params[1], **params[idx + 2]}

        xx = df.loc[:, "clean_text"]
        yy = df.loc[:, "userid"]
        xx, yy = shuffle(xx, yy)

        outputs.append(gridSearch(xx, yy, pipe, parameters, False))

    with open("gridSearchResults.txt", "w") as outFile:
        for oo in outputs:
            outFile.write(oo)
            outFile.write("\n------------------------------------\n")
Example #8
def baseDF(file):
    filepath = os.path.join(path, file)
    df = readHuge(filepath).set_index("id")  #.drop(dropcols,axis=1)
    checkPrint(df.shape)
    checkPrint(df.groupby("userid")["clean_text"].count())
    return df