if not os.path.exists(sourcedir + docid + '.tsv'): continue docs.append(row['volid']) logistic.append(float(row['logistic'])) dates.append(float(row['dateused'])) logistic = np.array(logistic) dates = np.array(dates) numdocs = len(docs) categories = dict() for field in fields: categories[field] = np.zeros(numdocs) wordcounts = filecab.get_wordcounts(sourcedir, '.tsv', docs) for i, doc in enumerate(docs): ctcat = Counter() allcats = 0 for word, count in wordcounts[doc].items(): allcats += count for field in fields: if word in inquirer[field]: ctcat[field] += count for field in fields: categories[field][i] = ctcat[field] / (allcats + 1) logresults = [] dateresults = []
# NOTE(review): this chunk arrived with all newlines/indentation stripped onto a
# single physical line (a syntax error as-is).  Reconstructed block structure
# below; the final `for` loop is truncated at the end of the visible chunk, so
# its remaining body (whatever is done with `colorct` / `outrows`) lives past
# this excerpt -- confirm against the full file.

# Metadata lookups keyed by volume id.
logistic = dict()   # volid -> logistic-regression score
realclass = dict()  # volid -> prestige label
titles = dict()     # volid -> title
dates = dict()      # volid -> year used

with open('../metadata/prestigeset.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        logistic[row['volid']] = float(row['logistic'])
        realclass[row['volid']] = row['prestige']
        titles[row['volid']] = row['title']
        dates[row['volid']] = int(row['dateused'])

sourcedir = '../sourcefiles/'
# docid -> {word: count} for every volume in the prestige set.
documents = filecab.get_wordcounts(sourcedir, '.tsv', set(logistic))

outrows = []
for docid, doc in documents.items():
    if docid not in logistic:
        continue
    else:
        # allwords starts at 1 so a later ratio cannot divide by zero.
        allwords = 1
        colorct = 0
        for word, count in doc.items():
            allwords += count
            if word in colors:
                colorct += count
        # (loop body continues past this excerpt)