def extract_citations(): """ Extract citations from the papers, based on the title of other papers """ titles = dict((p.title.strip(), p.id) for p in Papers.select(Papers) if len(p.title.strip().split(' ')) > 1) for paper in Papers.select(): citations = [ titles[t] for t in list(titles.keys()) if t in paper.paper_text and titles[t] != paper.id ] for citation in citations: create_citation(paper.id, citation) print("Paper {paper_id}".format(paper_id=paper.id)) print(citations)
def get_documents() -> list: """ Given the documents in the option parser return the body of these documents Returns: list of corpus of bodies """ paper = Papers.select().where(Papers.id == options.document).get() return paper.paper_text
def main():
    """Scrape citation counts for papers until the scraper signals failure.

    Iterates papers starting at offset ``BEGIN``, sleeping 2 s between
    requests; a scraped count of -1 is the stop sentinel. Collected counts
    are written to CSV at the end.
    """
    papers = Papers.select().limit(LIMIT)
    n_citations = []
    for index, paper in enumerate(papers[BEGIN:]):
        print(paper.id - 1)
        count = scrape_citation_count(paper)
        n_citations.append(count)
        if count == -1:
            # Sentinel from the scraper: stop and keep what we have so far.
            print("Finished after {} iterations".format(index))
            break
        time.sleep(2)  # be polite to the remote service
    citations_to_csv(n_citations)
    print(n_citations)
def main():
    """Train (or load a cached) LDA model, derive topic labels, and label docs.

    Caches the trained model/dictionary as pickles and the topic labels as a
    text file so repeated runs skip the expensive steps.
    """
    if not os.path.isfile('ldamodel.pkl'):
        # Train on a random sample of 200 papers and cache the artifacts.
        papers = [p.paper_text
                  for p in Papers.select().order_by(fn.Random()).limit(200)]
        ldamodel, dictionary = train_classifier(papers, 20)
        with open('ldamodel.pkl', 'wb') as f:
            pickle.dump(ldamodel, f)
        with open('dictionary.pkl', 'wb') as f:
            pickle.dump(dictionary, f)
    else:
        with open('ldamodel.pkl', 'rb') as f:
            ldamodel = pickle.load(f)
        with open('dictionary.pkl', 'rb') as f:
            dictionary = pickle.load(f)
    if not os.path.isfile('labels.txt'):
        topic_labels = extract_topics(ldamodel)
        with open('labels.txt', 'w') as f:
            f.write("\n".join(topic_labels))
    else:
        with open('labels.txt', 'r') as f:
            topic_labels = f.read().splitlines()
    create_database_labels(topic_labels)
    print(topic_labels)
    label_documents(ldamodel, topic_labels, dictionary)
def label_documents(model: LdaModel, topic_labels: list, dictionary: corpora.Dictionary): """ Labels the documents in the database including the documents not present in the training set Args: model: the trained ldaModel topic_labels: list with labels of the topics dictionary: dictionary that is used when scanning the training data """ papers = Papers.select() for paper in papers: text, title, paper_id = paper.paper_text, paper.title, paper.id cleaned_text = clean(text) text_dict = dictionary.doc2bow(cleaned_text) topic_scores = model[text_dict] labels = [] for topic_score in topic_scores: if topic_score[1] > 1/len(topic_scores): Papers_labels.get_or_create(paper_id=paper_id, label_id=topic_score[0]) labels.append(topic_labels[topic_score[0]]) print(title, labels)
def main():
    """Cluster the first 100 papers by the pairwise distance of their texts."""
    papers = Papers.select().limit(100)
    labels = [p.title for p in papers]
    bodies = [p.paper_text for p in papers]
    features = extract_features(bodies)
    cluster(create_distance_matrix(features), labels)
def main():
    """Print the reference section, split into lines, of the first 10 papers.

    The text after the last case-insensitive occurrence of "reference(s)"
    is assumed to be the bibliography.
    """
    # re.IGNORECASE replaces the hand-rolled [Rr][Ee]... classes, and the
    # pattern is non-capturing so re.split returns only the text segments.
    ref_pattern = re.compile(r'references?', re.IGNORECASE)
    for paper in Papers.select().limit(10):
        citations = ref_pattern.split(paper.paper_text)[-1]
        print(citations.split('\n'))