# --- genres ---
# mId2Genre.txt: one line per movie -> (mId, num of genres, genre ids)
# (assumes the `movies` DataFrame, `Indexer`, `id_base`, and `import ast` are set up above)
genreIdx = Indexer()
with open("processed_data/mId2Genre.txt", "w") as f:
    for idx, row in movies.iterrows():
        mId, raw_genres = row['mId'], row['genres']
        # the 'genres' column holds a Python-literal list of dicts; parse it with
        # ast.literal_eval (as the credits below already do) instead of the
        # fragile quote-swap + json.loads, which breaks on apostrophes in names
        genres_l = ast.literal_eval(raw_genres)
        f.write("%d %d" % (mId, len(genres_l)))
        for g in genres_l:
            f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
        f.write("\n")

# Genre2Id.txt: one line per genre -> (genre id, genre name)
num_genres = len(genreIdx)
with open("processed_data/Genre2Id.txt", "w") as f:
    for i in range(num_genres):
        f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
id_base += num_genres

'''
create credits
mId2CC.txt: 45476 lines
each line includes (mId, num of crew/casts, cIds)
'''
credits = readCreditData(args, tmid2mid)
print("credits.shape %s" % str(credits.shape))
cIdx = Indexer()
f = open("processed_data/mId2CC.txt", "w")  # closed after this loop (truncated here)
for idx, row in credits.iterrows():
    mId, raw_cast, raw_crew = row['mId'], row['cast'], row['crew']
    cast_l = ast.literal_eval(raw_cast)
    crew_l = ast.literal_eval(raw_crew)
    attr = []
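# For reference, a minimal sketch of the Indexer used above and below.
# This is an assumption: the real class is defined elsewhere in this repo;
# the sketch only matches the add_and_get_index / get_object / len() calls
# this code makes, mapping objects to dense integer ids.
class Indexer(object):
    def __init__(self):
        self.obj_to_idx = {}
        self.idx_to_obj = []

    def add_and_get_index(self, obj):
        # assign the next free id on first sight, then reuse it
        if obj not in self.obj_to_idx:
            self.obj_to_idx[obj] = len(self.idx_to_obj)
            self.idx_to_obj.append(obj)
        return self.obj_to_idx[obj]

    def get_object(self, idx):
        return self.idx_to_obj[idx]

    def __len__(self):
        return len(self.idx_to_obj)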
    # (still inside the per-datapoint cleaning loop: `i`, `data`, `words`,
    # and the corpus-wide frequency table `word_cnts` are defined above)
    # blank out rare words (corpus frequency <= 10)
    for idx, word in enumerate(words):
        if word_cnts[word] <= 10:
            words[idx] = ""
    line = " ".join(words)
    if line.isspace() or line == "":
        count_of_bad += 1
    else:
        data.text = line
        new_dataset.append(data)
    if i % 200000 == 0:
        print("iterated", i, "cleaned datapoints")

# count how many cleaned datapoints each emoji label has
emoji_sample_count = {}
for dp in new_dataset:
    label_name = indexer.get_object(dp.label)
    emoji_sample_count[label_name] = emoji_sample_count.get(label_name, 0) + 1
print("emoji_sample count ", emoji_sample_count)

# optionally cap every label at 50000 samples to balance the dataset
shuffle(new_dataset)
sample_dataset = []
emoji_sample_counter = {}
if sys.argv[2] == "Sample":
    for dp in new_dataset:
        # fixed: the original initialized the counter for an unseen label
        # without appending, so the first datapoint of each label was dropped
        if emoji_sample_counter.get(dp.label, 0) < 50000:
            sample_dataset.append(dp)
            emoji_sample_counter[dp.label] = emoji_sample_counter.get(dp.label, 0) + 1
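# A self-contained sketch of the per-label capping step above, pulled out
# as a reusable helper. The name `cap_per_label` and its parameters are
# illustrative assumptions, not part of the original script.
import random
from collections import defaultdict

def cap_per_label(dataset, cap=50000, seed=0):
    """Shuffle, then keep at most `cap` datapoints for each label."""
    random.seed(seed)
    dataset = list(dataset)
    random.shuffle(dataset)
    kept, counts = [], defaultdict(int)
    for dp in dataset:
        if counts[dp.label] < cap:
            kept.append(dp)
            counts[dp.label] += 1
    return kept

# usage (hypothetical): sample_dataset = cap_per_label(new_dataset, cap=50000)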