def main_loop():
    db_connect()
    record_list = get_live_record_list(92613)
    info_col = get_info_col(db.mongo_client)
    print("Record list count: {}".format(len(record_list)))
    for record in record_list:
        # skip records that already exist in the info collection
        if not info_col.count_documents({"rid": record.rid}):
            print("Start crawling record {}".format(record.rid))
            try:
                save_record(record.rid)
            except Exception as e:
                print("Failed to crawl record {}: {}".format(record.rid, e))
Example #2
def db_ssea_import(ssea_dir, matrix_dir, name, host):
    sample_sets_json_file = os.path.join(ssea_dir, 'sample_set.json')
    with open(sample_sets_json_file) as f:
        ss_name = json.load(f)['name']
    colls = db_connect(name, host)
    ss = colls['sample_sets']
    ss_check = ss.find_one({'name':ss_name})
    if ss_check is not None:
        logging.info('Sample set \'%s\' already in database' % ss_name)
    else:
        logging.info('Importing sample set \'%s\' to %s database on mongo server: %s' % (ss_name, name, host))

        # import the sample_set file
        # ss_id is derived from the current number of sample sets in the database
        ss_id = str(ss.count())
        _ssea_path = ssea.__path__[0]
        _merge_path = os.path.join(_ssea_path, 'utils/mongo_ssea_printJSON.py')
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '-s'], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'sample_sets', '--host', host, '-d', name, '--upsert'], stdin=p1.stdout)
        p1.wait()
        p2.wait()
        
        # mark the new sample set as 'TMP' until every collection has been imported
        ss.update({'_id': int(ss_id)}, {'$set': {'name': 'TMP'}})

        # import the config file
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '-c'], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'configs', '--host', host, '-d', name, '--upsert'], stdin=p1.stdout)
        p1.wait()
        p2.wait()
        
        # import the results file
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '-r'], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'results', '--host', host, '-d', name, '--upsert'], stdin=p1.stdout)
        p1.wait()
        p2.wait()
        
        # import the hists file
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '--hist'], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'hists', '--host', host, '-d', name, '--upsert'], stdin=p1.stdout)
        p1.wait()
        p2.wait()
        
        # create the merged collection
        _merge_path = os.path.join(_ssea_path, 'utils/mongo_merge_printJSON.py')
        p1 = subprocess.Popen(['python', _merge_path, '--ss_id', ss_id, '--host', host, '--name', name], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'merged', '--host', host, '-d', name, '--upsert'], stdin=p1.stdout)
        p1.wait()
        p2.wait()
        
        # all collections imported; replace the 'TMP' placeholder with the real name
        ss.update({'_id': int(ss_id)}, {'$set': {'name': ss_name}})

        logging.info("Finished importing '%s'" % ss_name)
Example #3
def db_delete_ss(name, host, ss_id):
    colls = db_connect(name, host)
    sample_sets = colls['sample_sets']
    configs = colls['configs']
    results = colls['results']
    hists = colls['hists']
    merged = colls['merged']
    
    ss_name = sample_sets.find_one({'_id':ss_id})['name']
    
    # remove every document associated with this sample set
    logging.info('Removing sample set \'%s\'' % ss_name)
    
    sample_sets.remove({'_id': ss_id})
    configs.remove({'_id': ss_id})
    hists.remove({'_id': ss_id})
    results.remove({'ss_id': ss_id})
    merged.remove({'ss_id': ss_id})
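Collection.remove() is deprecated in newer PyMongo releases; a rough equivalent of the deletions above against the PyMongo 3+ API would look like this sketch (same collection layout assumed):

def db_delete_ss_v3(name, host, ss_id):
    # same deletions as db_delete_ss, using delete_many instead of remove
    colls = db_connect(name, host)
    colls['sample_sets'].delete_many({'_id': ss_id})
    colls['configs'].delete_many({'_id': ss_id})
    colls['hists'].delete_many({'_id': ss_id})
    colls['results'].delete_many({'ss_id': ss_id})
    colls['merged'].delete_many({'ss_id': ss_id})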
Example #4
def db_repair(name, host):
    colls = db_connect(name, host)
    sample_sets = colls['sample_sets']
    configs = colls['configs']
    results = colls['results']
    hists = colls['hists']
    merged = colls['merged']

    # check for incomplete imports (sample sets still named 'TMP')
    logging.info('Removing incomplete imports')
    tmps = sample_sets.find({'name': 'TMP'})
    tmp_ids = []
    for ss in tmps:
        tmp_ids.append(ss['_id'])

    sample_sets.remove({'_id': {'$in': tmp_ids}})
    configs.remove({'_id': {'$in': tmp_ids}})
    hists.remove({'_id': {'$in': tmp_ids}})
    results.remove({'ss_id': {'$in': tmp_ids}})
    merged.remove({'ss_id': {'$in': tmp_ids}})
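The loop that collects the TMP ids can also be written with PyMongo's distinct(); a one-line alternative (not in the original, shown only as a sketch):

    tmp_ids = sample_sets.distinct('_id', {'name': 'TMP'})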
Example #5
def main(argv=None):
    '''Command line options.'''
    # create instance of run configuration

    # Setup argument parser
    parser = argparse.ArgumentParser()
    # Add command line parameters
    parser.add_argument("-n",
                        "--name",
                        dest='name',
                        default='compendia',
                        help='name for ssea run (will be name of database)')
    parser.add_argument("--host",
                        dest='host',
                        default='localhost:27017',
                        help='name of mongodb server to connect to')
    parser.add_argument("--ss_id",
                        dest='ss_id',
                        help='mongo _id number of sample set being merged')
    # Process arguments
    args = parser.parse_args()
    # setup logging

    level = logging.DEBUG
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    colls = db_connect(args.name, args.host)
    transcripts = colls['transcripts']
    results = colls['results']

    #parse through transcript metadata and build dict to be used during merge
    trans_dict = collections.defaultdict(lambda: {})
    logging.info('Parsing through transcript metadata to prepare merge')
    tot = transcripts.find().count()
    i = 0
    for x in transcripts.find():
        i += 1
        if (i % 50000) == 0:
            logging.debug('Finished %d/%d' % (i, tot))

        key = x['_id']
        #create a dict placeholder for this _id element
        id_dict = {}
        for field in fields_trans:
            id_dict[field] = x[field]
        #create a combined locus and strand field
        locus = x['locus']
        strand = x['strand']
        new_loc = locus + '(' + strand + ')'
        id_dict['loc_strand'] = new_loc
        trans_dict[key] = id_dict

    ss_id = int(args.ss_id)
    #print merged json
    tot = results.find({'ss_id': ss_id}).count()
    logging.info(
        'Merging transcript metadata and results fields (%d total merged documents)'
        % tot)
    fields_results.append('_id')
    for x in results.find({'ss_id': ss_id}):
        # create another dict placeholder to be printed as JSON
        doc = {}
        for field in fields_results:
            if field == '_id':
                doc[field] = str(x[field])
            else:
                doc[field] = x[field]
        t_id = x['t_id']
        trans_meta = trans_dict[t_id]

        for key in trans_meta:
            doc[key] = trans_meta[key]
        print(json.dumps(doc))
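Because trans_dict is a defaultdict, a result whose t_id never appeared in the transcripts collection is silently merged with empty metadata. A small variant of the lookup (an assumption, not part of the original script) that logs such cases instead:

    trans_meta = trans_dict.get(t_id)
    if trans_meta is None:
        logging.warning('no transcript metadata found for t_id %s', t_id)
        trans_meta = {}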