# Author: Xu Wang
# Email: [email protected]
#
# Filename: retrieve_publications.py
#
# Script that records publication titles in the Publications table.
# As written it loads conference titles only; the journal variant is kept
# below as a commented-out reference.

from database import Publications
from journal import *

# Table handle shared by the loading loop below.
PublicationsTable = Publications()

# Retrieve journals (disabled). Same shape as the conference loop, but with
# a different title source and publication type:
#
# publication_type = 'Journals & Magazines'
# for number, name in all_journal_titles():
#     print(name, number)
#     PublicationsTable.add_publication_title_type_number(
#         name, publication_type, number)

# Retrieve conferences: each item from all_conference_titles() is unpacked
# as (number, name), echoed to stdout, and stored under 'Conferences'.
# NOTE(review): all_conference_titles presumably comes from the wildcard
# import of `journal` -- confirm.
publication_type = 'Conferences'
for number, name in all_conference_titles():
    print(name, number)
    PublicationsTable.add_publication_title_type_number(
        name,
        publication_type,
        number,
    )
'authors', 'terms'] with open(filename, 'ta', newline='') as fp: tsv_writer = csv.writer(fp, delimiter='\t') # tsv_writer.writerow(fields) try: print('get publication urls', ptitle) urls = journal_articles_requests_urls(ptitle, articles_per_request=1000) for url in urls: try: print('reading --', url) paper_dicts = retrieve_documents_from_url(url) paperlist = [] for pd in paper_dicts: p = Paper(ptitle, pd) paperlist.append(p) paper_str_list = [] for p in paperlist: paper_str_list.append(p.to_list()) tsv_writer.writerows(paper_str_list) except Exception as e: print(str(e)) except Exception as e: print(str(e)) if __name__ == '__main__': publications_table = Publications() for ptitle in publications_table.get_all_publications_title(): retrieve_papers_to_tsv(ptitle)
def mp_retriever(inqueue):
    """
    Worker loop: drain ``(ptitle, ptype)`` pairs from a queue.

    Each pair taken from the queue is passed to
    ``retrieve_papers_to_tsv(ptitle, ptype)``. The loop ends as soon as a
    pair whose title is ``None`` is received; only the title is inspected,
    so the second element of that sentinel pair is ignored.

    :param inqueue: queue-like object whose ``get()`` returns
        ``(ptitle, ptype)`` pairs.
    :return: None
    """
    while True:
        ptitle, ptype = inqueue.get()
        if ptitle is None:
            # Sentinel received: this worker has nothing left to do.
            break
        retrieve_papers_to_tsv(ptitle, ptype)


if __name__ == '__main__':
    publications_table = Publications()
    workers = 128
    title_queue = Queue()

    # Queue every pair produced by get_all_titles_numbers(); the pairs are
    # forwarded unchanged as (ptitle, ptype).
    # NOTE(review): the method name says "numbers" while the second element
    # is used as a type here -- confirm which one it really is.
    for ptitle, ptype in publications_table.get_all_titles_numbers():
        title_queue.put((ptitle, ptype))

    # One (None, None) sentinel per worker, queued after all real work, so
    # that every worker eventually receives a stop marker.
    for _ in range(workers):
        title_queue.put((None, None))

    # Start the workers; the handles are collected in `processes`.
    processes = []
    for _ in range(workers):
        p = Process(target=mp_retriever, args=(title_queue, ))
        p.start()
        processes.append(p)