def getInfoAboutStance(stance="All"):
    """Print a sectioned metadata report for one stance or for all records.

    The records are loaded through ``ptd.getMetaDataAsList()`` when ``stance``
    is ``"All"``, otherwise through ``getStanceData(stance)``.  Each section
    hands one or two columns of the resulting frame to a reporting helper
    defined elsewhere in this module.

    Args:
        stance: ``"All"`` for every record, or the value the ``Stance``
            column must equal.

    Returns:
        None.  The report is written to standard output.
    """
    if stance == "All":
        data = pd.DataFrame(ptd.getMetaDataAsList())
    else:
        data = getStanceData(stance)

    rule = 80 * "-"

    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # print("") replaces the bare ``print`` statement (one blank line).
    print("SHOWING INFORMATION FOR THE SELECTED STANCE: '{}'\n".format(stance))
    print("Data consist of {} records".format(len(data)))
    print("")

    # (heading, reporter, columns passed to it, rule under heading?, closing rule?)
    # The last two sections print an extra rule below the heading, and the
    # final section has no closing rule -- kept exactly as before.
    sections = [
        ("TITLE INFO", getTitleInfo, ("Title",), False, True),
        ("YEAR INFO", getPublicationYear, ("Publication_year",), False, True),
        ("LANGUAGE INFO", getLanguage, ("Language",), False, True),
        ("REFERENCE INFO", getRefs, ("References",), False, True),
        ("ORGANIZATION INFO", getOrgInfo, ("Organization_info",), False, True),
        ("PUBLICATION LENGTH", getPubLength, ("Publication_length",), False, True),
        ("AUTHOR INFO", getAuthor, ("Authors",), False, True),
        ("HEADER AND SUB-HEADER INFO ", getHeader, ("Headers", "Sub_headers"), True, True),
        ("SUBJECT INFO", getSubject, ("Subjects",), True, False),
    ]

    for heading, report, columns, underline, close in sections:
        print(rule)
        print(heading)
        if underline:
            print(rule)
        # Columns are looked up only when their section is reached, so a
        # missing column fails at the same point in the output as before.
        report(*[getattr(data, column) for column in columns])
        if close:
            print(rule + "\n\n")
def storeLanguageToJson(stance="All", path="../TextFiles/meta_data/lang_NONE.json"):
    """Count the occurrences of each language and store the counts as JSON.

    Args:
        stance: ``"All"`` to use every record from
            ``ptd.getMetaDataAsList()``, otherwise the stance passed on to
            ``getStanceData``.
        path: File the counts are written to.  The default is the location
            this function has always used.
            NOTE(review): the default file name says "NONE" regardless of
            ``stance``; pass an explicit path to keep per-stance results
            apart.

    Returns:
        None.  A summary is printed and the ``{language: count}`` mapping is
        dumped to ``path``.
    """
    if stance == "All":
        languages = pd.DataFrame(ptd.getMetaDataAsList()).Language
    else:
        languages = getStanceData(stance).Language

    # dropna() returns a new Series, so the (possibly filtered) frame is not
    # modified in place.  The literal string "nan" is still treated as a
    # missing value, as it was before.
    known = [lang for lang in languages.dropna().tolist() if lang != "nan"]

    # Plain integer counters: the counts are printed and serialised as
    # 3 rather than 3.0.
    counts = {}
    for lang in known:
        counts[lang] = counts.get(lang, 0) + 1

    uniq = list(counts)
    print("There are {} different languages".format(len(uniq)))
    print("These languages are:\n{}".format(uniq))

    for key, total in counts.items():
        print("{} \t: {}".format(key, total))

    with open(path, "w") as f:
        json.dump(counts, f)
def getAllData():
    """Return the list from ``ptd.getMetaDataAsList()`` wrapped in a DataFrame."""
    return pd.DataFrame(ptd.getMetaDataAsList())
def getStanceData(stance):
    """Return the metadata rows whose ``Stance`` column equals ``stance``.

    The metadata is re-read through ``ptd.getMetaDataAsList()`` on every
    call and converted to a DataFrame before filtering.
    """
    frame = pd.DataFrame(ptd.getMetaDataAsList())
    matches = frame.Stance == stance
    return frame[matches]
def _loadPickle(path):
    """Unpickle and return the object stored at ``path``."""
    # Pickle data is binary: text mode fails on Python 3 and can corrupt the
    # stream on Windows.
    with open(path, "rb") as f:
        return pickle.load(f)


def _uniqueTitleWords(frame):
    """Return the set of space-separated words found in ``frame.Title``."""
    # dropna() skips records without a title instead of failing on .split().
    return {word
            for title in frame.Title.dropna().tolist()
            for word in title.split(" ")}


def getCount(stance="AGAINST", source="abstract"):
    """Print how many title words of the other stances occur among a stance's topic words.

    The topic words are read from two pickle files,
    ``<source>_<stance>_count.pkl`` and ``<source>_<stance>_tfidf.pkl``, under
    ``../DataProcessing/TopicModelling/<source>/``.  The two loaded objects
    are concatenated and the element at index 1 of every entry is taken as
    the word.

    Args:
        stance: Stance whose topic-word files are loaded.  ``"AGAINST"`` is
            compared with FAVOR and NONE titles, ``"FAVOR"`` with AGAINST and
            NONE titles, and any other value with AGAINST and FAVOR titles.
        source: Sub-directory and file-name prefix of the topic-model output;
            ``"abstract"`` (the previous hard-coded value) or ``"title"``.

    Returns:
        None.  All results are printed.
    """
    base = "../DataProcessing/TopicModelling/{}/".format(source)
    prefix = source + "_"

    entries = (_loadPickle(base + prefix + stance + "_count.pkl")
               + _loadPickle(base + prefix + stance + "_tfidf.pkl"))
    # A set gives constant-time membership tests for the comparisons below.
    topic_words = {entry[1] for entry in entries}
    print("# of uniq words in {} topic is: {}".format(stance, len(topic_words)))

    df = pd.DataFrame(ptd.getMetaDataAsList())
    title_words = {
        label: _uniqueTitleWords(df[df.Stance == label])
        for label in ("AGAINST", "FAVOR", "NONE")
    }

    print("Length of unique words in {} is {}.\nLength of unique words in {} is {}.\nLength of unique words in {} is {}".format(
        "AGAINST", len(title_words["AGAINST"]),
        "FAVOR", len(title_words["FAVOR"]),
        "NONE", len(title_words["NONE"])))
    print("")

    if stance == "AGAINST":
        first, second = "FAVOR", "NONE"
    elif stance == "FAVOR":
        first, second = "AGAINST", "NONE"
    else:
        first, second = "AGAINST", "FAVOR"

    # Each figure is computed independently.  Previously the final branch
    # did not reset its counter, so the FAVOR figure also included the
    # AGAINST hits.
    print("Number of title words from {} found in {} is {}\n".format(
        first, stance, len(title_words[first] & topic_words)))
    print("Number of title words from {} found in {} is {}".format(
        second, stance, len(title_words[second] & topic_words)))