def calculate_popularity(file_path):
    """Count how often each ``cited_id`` occurs among the items of ``ref_gen``.

    Args:
        file_path: Handed unchanged to ``ref_gen``, which supplies the
            items to count (each must expose a ``cited_id`` attribute).

    Returns:
        A ``Counter`` mapping every ``cited_id`` seen to its number of
        occurrences.
    """

    def _cited_ids_with_progress():
        # Yield one cited id per item, printing a progress marker for every
        # 100th item (starting with the very first one).
        for position, reference in enumerate(ref_gen(file_path)):
            if not position % 100:
                print("cp %d" % position)
            yield reference.cited_id

    return Counter(_cited_ids_with_progress())
def count_paragraph_titles_for_citations(file_path):
    """Tally, per ``cited_id``, the lower-cased titles of the items citing it.

    Args:
        file_path: Handed unchanged to ``ref_gen``, which supplies the
            items to process (each must expose ``cited_id`` and ``title``).

    Returns:
        A ``defaultdict(Counter)`` keyed by ``cited_id``; each value counts
        the lower-cased ``title`` strings seen for that id, e.g.
        ``{"1233242": {"introduction": 1234, "background": 123}}``.
    """
    titles_by_cited_id = defaultdict(Counter)
    for position, reference in enumerate(ref_gen(file_path)):
        # Progress output for every 100th item.
        if not position % 100:
            print(position)
        # Lower-case first so that differently-cased titles share one bucket.
        lowered_title = reference.title.lower()
        title_counts = titles_by_cited_id[reference.cited_id]
        title_counts[lowered_title] += 1
    return titles_by_cited_id
def find_cits_with_sentiment_words(file_path):
    """Collect the items whose sentence contains at least one sentiment word.

    Every item produced by ``ref_gen(file_path)`` has its sentence
    (``ref.get_sentence()``) tokenized with ``smart_tokenize``; the distinct
    tokens are intersected with the module-level ``sentiment_words``
    collection.

    Args:
        file_path: Handed unchanged to ``ref_gen``.

    Returns:
        A set of ``(ref, words)`` pairs, one per item with a non-empty
        intersection. ``words`` is a sorted tuple of the matching tokens.

    Note:
        The previous implementation called ``set.add`` with two arguments
        (a ``TypeError``) and used the Python-2-only ``<>`` operator. The
        pair is now stored as a single tuple, and the matches are kept in a
        tuple rather than a list because set members must be hashable.
        NOTE(review): this also requires the ``ref`` objects themselves to be
        hashable -- confirm against their class definition.
    """
    found = set()
    for idx, ref in enumerate(ref_gen(file_path)):
        # Progress output for every 100th item.
        if idx % 100 == 0:
            print(idx)
        tokens = set(smart_tokenize(ref.get_sentence()))
        matched = tokens.intersection(sentiment_words)
        if matched:
            # Sorted so the stored tuple does not depend on set iteration order.
            found.add((ref, tuple(sorted(matched))))
    return found
def count_words_in_context(file_path, popular_pmids):
    """Count the tokens of citing sentences, grouped by ``cited_id``.

    Only items whose ``cited_id`` is contained in ``popular_pmids`` are
    counted; all others are skipped after the progress output.

    Args:
        file_path: Handed unchanged to ``ref_gen``, which supplies the items.
        popular_pmids: Container of the ``cited_id`` values to keep; only
            membership tests (``in``) are performed on it.

    Returns:
        A ``defaultdict(Counter)`` keyed by ``cited_id``; each value counts
        the tokens that ``smart_tokenize`` produced from
        ``ref.get_sentence(0)`` across all kept items for that id.
    """
    counts_by_cited_id = defaultdict(Counter)
    for position, reference in enumerate(ref_gen(file_path)):
        # Progress output for every 100th item, whether or not it is kept.
        if not position % 100:
            print(position)
        cited = reference.cited_id
        if cited in popular_pmids:
            sentence_tokens = smart_tokenize(reference.get_sentence(0))
            counts_by_cited_id[cited] += Counter(sentence_tokens)
    return counts_by_cited_id