def bootstrap_keyphrases_solicitations():
    """Mine and store keyphrases for every solicitation document.

    Phrases extracted from each document's 'description' are processed
    via NLP; phrases already present in its 'keywords' field are stored
    directly as vetted.
    """
    db = connect_to_arango()
    graph = get_technode_graph(db)
    nlp = English()
    collection = db.col(SOLICITATION_COLLECTION)

    # Snapshot only the fields we need before processing, instead of
    # iterating the cursor directly: long-running cursors used to time
    # out (may no longer be needed on ArangoDB 3+).
    snapshot = []
    for full_doc in collection.all():
        trimmed = {'_id': full_doc['_id']}
        for field in ('keywords', 'description'):
            if field in full_doc:
                trimmed[field] = full_doc[field]
        snapshot.append(trimmed)

    for doc in snapshot:
        if 'description' in doc:
            process_doc_for_keyphrases(nlp, db, graph,
                                       KEYPHRASE_SOLICITATION_RELATIONS,
                                       doc['description'], doc['_id'])
        if 'keywords' in doc:
            store_keyphrases_for_doc(db, graph,
                                     KEYPHRASE_SOLICITATION_RELATIONS,
                                     doc['keywords'], doc['_id'],
                                     vetted=True)
def associate_solicitation_contract():
    """Link each solicitation to every contract sharing its topic code.

    Creates one edge in the 'solicitation_contract_relations' collection
    per (solicitation, contract) pair whose topic codes match.
    """
    db = connect_to_arango()
    graph = get_technode_graph(db)
    contract_col = db.col('contracts')
    solicitation_col = db.col('solicitations')

    for sol in solicitation_col.all():
        # Contracts reference the solicitation's topic via 'topic_code'.
        matches = contract_col.get_by_example({'topic_code': sol['topic']})
        # Brief pause between queries to avoid hammering the server.
        time.sleep(0.1)
        for contract in matches:
            graph.create_edge("solicitation_contract_relations",
                              {"_from": sol['_id'],
                               "_to": contract['_id']})
def bootstrap_keyphrases_contracts():
    """Mine and store keyphrases for every contract document.

    Phrases extracted from each document's 'abstract' are processed via
    NLP; phrases already present in its 'keywords' field are stored
    directly.
    """
    db = connect_to_arango()
    graph = get_technode_graph(db)
    nlp = English()
    col_documents = db.col(CONTRACT_COLLECTION)
    # Snapshot only the fields we need before processing, rather than
    # iterating the cursor directly, to avoid cursor timeout issues
    # (may no longer be needed on ArangoDB 3+).
    docs_lite = []
    for document in col_documents.all():
        doc_lite = {}
        doc_lite['_id'] = document['_id']
        if 'keywords' in document:
            doc_lite['keywords'] = document['keywords']
        if 'abstract' in document:
            doc_lite['abstract'] = document['abstract']
        docs_lite.append(doc_lite)
    for document in docs_lite:
        # BUGFIX: guard the 'abstract' lookup -- contracts without an
        # abstract previously raised KeyError here and aborted the run.
        # (Mirrors the 'description' guard in the solicitation bootstrap.)
        if 'abstract' in document:
            process_doc_for_keyphrases(nlp, db, graph,
                                       KEYPHRASE_CONTRACT_RELATIONS,
                                       document['abstract'],
                                       document['_id'])
        if 'keywords' in document:
            store_keyphrases_for_doc(db, graph,
                                     KEYPHRASE_CONTRACT_RELATIONS,
                                     document['keywords'],
                                     document['_id'])
def bootstrap_keyphrases_r2s():
    """Mine and store keyphrases for every R2 exhibit document.

    Text is drawn from the exhibit's 'program_desc' plus, for each
    entry under 'projects', its planned accomplishments and mission
    description when present.
    """
    db = connect_to_arango()
    graph = get_technode_graph(db)
    nlp = English()
    collection = db.col(R2_COLLECTION)

    # Snapshot only the fields we need before processing, instead of
    # iterating the cursor directly: long-running cursors used to time
    # out (may no longer be needed on ArangoDB 3+).
    snapshot = []
    for full_doc in collection.all():
        trimmed = {'_id': full_doc['_id']}
        for field in ('program_desc', 'projects'):
            if field in full_doc:
                trimmed[field] = full_doc[field]
        snapshot.append(trimmed)

    for doc in snapshot:
        doc_id = doc['_id']
        if 'program_desc' in doc:
            process_doc_for_keyphrases(nlp, db, graph,
                                       KEYPHRASE_R2_RELATIONS,
                                       doc['program_desc'], doc_id)
        for proj_key in doc.get('projects', {}):
            proj = doc['projects'][proj_key]
            if 'accomp_planned' in proj:
                # Accomplishments are stored as a list; flatten it into
                # one long string. Some lists are empty, so skip those.
                if len(proj['accomp_planned']) > 0:
                    accomp_str = "".join(proj['accomp_planned'])
                    process_doc_for_keyphrases(nlp, db, graph,
                                               KEYPHRASE_R2_RELATIONS,
                                               accomp_str, doc_id)
            if 'mission_desc' in proj:
                process_doc_for_keyphrases(nlp, db, graph,
                                           KEYPHRASE_R2_RELATIONS,
                                           proj['mission_desc'], doc_id)
def get_term_collection():
    """Return the ArangoDB collection that holds the tech terms."""
    return connect_to_arango().col('tech_terms')
def to_arango(self, colname='r2_exhibits'):
    """Persist this object as a vertex in the technode graph.

    colname -- target vertex collection (default 'r2_exhibits').
    """
    database = connect_to_arango()
    technode = get_technode_graph(database)
    technode.create_vertex(colname, self)
# NOTE(review): this leading fragment appears to be the tail of
# load_r2_exhibits() (invoked from __main__ below) -- its def line is
# not visible in this chunk; confirm against the full file.
# Enqueue one background job per XML exhibit file.
q = Queue('r2_load', connection=Redis())
for file in r2_files:
    if file.lower().endswith(".xml"):
        q.enqueue(r2file_to_arango, file)
        #print "Loading %s" % (file)

def load_solicitation_listings():
    # Queue one graph-load job per solicitation listing read from the
    # local JSON snapshot.
    sque = Queue('solicitation_load', connection=Redis())
    listings = load_listings_from_file('/home/brian/technodeminer/local_listings.json')
    print "Enqueing solicitation load tasks"
    for listing in listings:
        #print "Adding %s to job q" % (listing['ComponentURL'])
        # NOTE(review): 'graph' and 'db' here are the module-level names
        # bound by the __main__ block below -- this function would fail
        # with NameError if called without that setup; verify no other
        # caller imports it directly.
        sque.enqueue(listing_to_graph, listing, graph, db)

def get_listings_from_web():
    # Build a fresh ListingReader for pulling listings from the web.
    return ListingReader()

if __name__ == '__main__':
    db = connect_to_arango()
    build_collections()
    graph = get_technode_graph(db)
    # Load SBIR Contracts before Solicitations
    print "Loading contracts"
    load_sbir_contracts(graph)
    print "Enqueuing solicitation jobs"
    load_solicitation_listings()
    print "Enqueuing r2 jobs"
    load_r2_exhibits()
from technodeminer.persistence.graph import connect_to_arango, get_technode_graph from pattern.vector import Model, Document db = connect_to_arango() graph = get_technode_graph(db) r2_exhibits = db.col('r2_exhibits') solicitations = db.col('solicitations') # for each solicitation #for solicitation in solicitations.all(): # constrain to air force FY16 solicit_gen = solicitations.get_by_example({"Component": "Air Force", "Fiscal Year": "FY16"}) r2_queries = [{"byear": 2013, "agency": "Air Force", "ba_num": 1}, {"byear": 2013, "agency": "Air Force", "ba_num": 2}, {"byear": 2013, "agency": "Air Force", "ba_num": 3}] r2_2013 = {"byear": 2013} # find all the contracts that use the same topic code r2_list = [] for query in r2_queries: for r2 in r2_exhibits.get_by_example(query): try: strings = [r2['program_desc']] projects = [r2['projects'] for k in r2['projects'].keys()] for proj in projects: try: strings.append(proj['mission_desc']) except KeyError as e: pass