import os

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from tqdm import tqdm

# path, outPath, userBase, dropcols, readHuge, checkPrint, and gridSearch are
# assumed to be defined elsewhere in this module.


def smallFileReport(sizes):
    '''Print the shape and per-user Tweet counts for each small file.'''
    os.chdir(outPath)
    filenames = ["{}{}small.csv".format(userBase, ii) for ii in sizes]
    for ff in filenames:
        df = readHuge(ff)  #.drop(dropcols, axis=1).set_index("id")
        print(ff)
        print(df.shape)
        print(df.groupby("userid")["clean_text"].count())
def selectUsernames(sizes, numUsers):
    '''Sample numUsers user ids from the last size file and save them to userSample<numUsers>.csv.'''
    os.chdir(outPath)
    filenames = ["{}{}.csv".format(userBase, ii) for ii in sizes]
    maxFile = readHuge(filenames[-1])
    usernames = maxFile["userid"].sample(numUsers)  # These users must be in all files
    out = os.path.join(outPath, "userSample{}.csv".format(numUsers))
    usernames.to_csv(out)
def chk():
    os.chdir(outPath)
    output = []
    for file in os.listdir(outPath):
        df = readHuge(file)
        output.append((file, df.shape))
    for out in output:
        print("{}: {}".format(*out))
def baseDF(file, out=None):
    '''Load a Tweet file indexed by id and report its shape and per-user Tweet counts.'''
    filepath = os.path.join(path, file)
    df = readHuge(filepath).set_index("id")
    try:
        df = df.drop(dropcols, axis=1)
    except KeyError:
        pass  # columns already absent
    checkPrint(df.shape, out)
    checkPrint(df.groupby("userid")["clean_text"].count(), out)
    return df
def buildUserFiles(mainFile, cut=0.5):
    '''
    Assemble the files for testing
    UserX = X tweets per user
    '''
    report = open("report.txt", 'w')
    file = readHuge(mainFile)  #.drop(dropcols, axis=1)
    index = file["userid"].unique().tolist()
    cutoff = int(len(index) * cut)
    report.write("Base: {} users with {} Tweets total\n".format(
        len(index), file.shape[0]))
    cnt = 0
    safe = True
    while cnt < 1000 and safe:
        cnt += 25
        dfs = []
        hold = []
        dropids = []
        dropuser = []
        for uid in tqdm(index):
            tweets = file[file["userid"] == uid]
            try:
                picked = tweets.sample(cnt)
                dfs.append(picked)
            except ValueError:  # user has fewer than cnt Tweets
                hold.append("\tUser {} did not have {} Tweets\n".format(
                    uid, cnt))
                dropids.extend(tweets.index.tolist())
                dropuser.append(uid)
        result = pd.concat(dfs)
        outfile = "{}{}.csv".format(userBase, cnt)
        out = os.path.join(outPath, outfile)
        report.write("{} has {} users and {} Tweets\n".format(
            outfile, len(result["userid"].unique()), result.shape[0]))
        for hh in hold:
            report.write(hh)
        file.drop(dropids, inplace=True)
        for dd in dropuser:
            index.remove(dd)
        if len(index) < cutoff:  # Less than half the users now
            safe = False
        report.write("\n")
        result.to_csv(out)
        print("{} created".format(outfile))
    report.close()
def makeSmallFiles(sizes, numUsers, sampleSize=None):
    '''Subset each size file to a sample of the users saved by selectUsernames.'''
    if sampleSize is None:
        sampleSize = numUsers
    os.chdir(outPath)
    nameFile = "userSample{}.csv".format(numUsers)
    names = pd.read_csv(nameFile, header=0, index_col=0)["userid"].sample(sampleSize)
    names = names.tolist()
    filenames = ["{}{}.csv".format(userBase, ii) for ii in sizes]
    for ff in filenames:
        df = readHuge(ff).drop(dropcols, axis=1).set_index("id")
        sample = df.loc[df["userid"].isin(names), :]
        outname = ff.replace(".csv", "small.csv")
        sample.to_csv(outname)
def main(file, one, two, threes, params):
    '''Grid search each classifier in threes in a vect -> tfidf -> clf pipeline and save the results.'''
    filepath = os.path.join(path, file)
    df = readHuge(filepath).set_index("id").drop(dropcols, axis=1)
    outputs = []
    for idx, cv in enumerate(threes):
        #print(cv.get_params())
        pipe = Pipeline([("vect", one), ("tfidf", two), ("clf", cv)], verbose=True)
        parameters = {**params[0], **params[1], **params[idx + 2]}
        xx = df.loc[:, "clean_text"]
        yy = df.loc[:, "userid"]
        xx, yy = shuffle(xx, yy)
        outputs.append(gridSearch(xx, yy, pipe, parameters, False))
    with open("gridSearchResults.txt", "w") as outFile:
        for oo in outputs:
            outFile.write(oo)
            outFile.write("\n------------------------------------\n")
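# A hedged illustration (not part of the original module) of the layout main()
# expects for `params`, inferred from how it is unpacked above: two grids shared
# by every run (vectorizer, tf-idf) followed by one grid per classifier in
# `threes`. Keys must use the pipeline step names "vect", "tfidf", and "clf";
# the specific values below are hypothetical.
EXAMPLE_PARAMS = [
    {"vect__ngram_range": [(1, 1), (1, 2)]},  # shared vectorizer grid
    {"tfidf__use_idf": [True, False]},        # shared tf-idf grid
    {"clf__C": [0.1, 1.0, 10.0]},             # grid for threes[0]
    {"clf__C": [0.01, 0.1, 1.0]},             # grid for threes[1]
]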
def baseDF(file):
    filepath = os.path.join(path, file)
    df = readHuge(filepath).set_index("id")  #.drop(dropcols, axis=1)
    checkPrint(df.shape)
    checkPrint(df.groupby("userid")["clean_text"].count())
    return df
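# A minimal end-to-end usage sketch, not part of the original module. It assumes
# the module-level names used above (path, outPath, userBase, dropcols, readHuge,
# checkPrint, gridSearch) are defined earlier in the file and that scikit-learn
# is installed; the file names and sizes are hypothetical.
if __name__ == "__main__":
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC

    sizes = [25, 50, 100]                # hypothetical tweets-per-user buckets
    buildUserFiles("allTweets.csv")      # hypothetical raw Tweet file
    selectUsernames(sizes, numUsers=50)  # sample users present in every size file
    makeSmallFiles(sizes, numUsers=50)   # subset each size file to those users
    smallFileReport(sizes)               # sanity-check the small files

    main("allTweets.csv",                # hypothetical raw Tweet file
         CountVectorizer(),
         TfidfTransformer(),
         [LogisticRegression(max_iter=1000), LinearSVC()],
         EXAMPLE_PARAMS)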