Example #1
def random_sampling(taxids):
    """Pick 200 random taxids, keep one random GI per taxid, and return a list
    of [gi, random_start_position, read_size] triples for training reads."""
    train_taxids = random.sample(taxids, 200)
    # One random GI per taxid; taxids without any associated GI are skipped.
    train_gis = [
        random.choice(gi_list)
        for gi_list in [getGiByTaxid(taxid) for taxid in train_taxids]
        if gi_list != []
    ]
    # Query NCBI in batches of 20 GIs to keep each request small.
    grp_size = 20
    final_train_gis = []
    grped_train_gis = [
        train_gis[i:i + grp_size] for i in range(0, len(train_gis), grp_size)
    ]
    for gi_list in grped_train_gis:
        try:
            ids_list = ",".join([str(gi) for gi in gi_list])
            records = ncbi_data.get(ncbi_data.get_gis_pos, ids_list)
            i = 0
            for record in records:
                rlen = record['Length']
                # Use the whole sequence when it is shorter than the reference
                # read size, otherwise pick a random ref_read_size window.
                if rlen <= ref_read_size:
                    random_pos = 0
                    read_size = rlen
                else:
                    random_pos = random.randint(0, rlen - ref_read_size)
                    read_size = ref_read_size
                final_train_gis.append([gi_list[i], random_pos, read_size])
                i += 1
        except RuntimeError as e:
            logger.error("GI not found in database: %s", str(e))
    return final_train_gis
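
The grouping above splits train_gis into batches of at most grp_size GIs so that each NCBI request stays small. A minimal, self-contained sketch of that slicing idiom, using made-up GI numbers:

gis = list(range(1, 48))  # 47 made-up GI numbers
grp_size = 20
batches = [gis[i:i + grp_size] for i in range(0, len(gis), grp_size)]
print([len(b) for b in batches])  # -> [20, 20, 7]
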
Example #2
def random_sampling(taxids):
    """Pick 200 random taxids, keep one random GI per taxid, and return a list
    of [gi, random_start_position, read_size] triples for training reads."""
    train_taxids = random.sample(taxids, 200)
    train_gis = [random.choice(gi_list) for gi_list
                    in [getGiByTaxid(taxid) for taxid in train_taxids]
                    if gi_list != []]
    grp_size = 20
    final_train_gis = []
    grped_train_gis = [train_gis[i:i+grp_size] for i in range(0,len(train_gis),grp_size)]
    for gi_list in grped_train_gis:
        try:
            ids_list = ",".join([str(gi) for gi in gi_list])
            records = ncbi_data.get(ncbi_data.get_gis_pos, ids_list)
            i = 0
            for record in records:
                rlen = record['Length']
                if rlen <= ref_read_size:
                    random_pos = 0
                    read_size = rlen
                else:
                    random_pos = random.randint(0, rlen - ref_read_size)
                    read_size = ref_read_size
                final_train_gis.append([gi_list[i], random_pos, read_size])
                i += 1
        except RuntimeError as e:
            logger.error("GI not found in database: %s", str(e))
    return final_train_gis
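
The position/size logic in both versions above keeps the whole record when it is shorter than ref_read_size and otherwise draws a random ref_read_size window. A self-contained sketch of that logic, with a made-up ref_read_size and record lengths:

import random

ref_read_size = 100  # assumed reference read length
for rlen in (60, 250):  # hypothetical record lengths
    if rlen <= ref_read_size:
        random_pos, read_size = 0, rlen  # take the whole sequence
    else:
        random_pos = random.randint(0, rlen - ref_read_size)
        read_size = ref_read_size  # random window of ref_read_size bases
    print(rlen, random_pos, read_size)
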
Example #3
def main(argv=None):
    """Parse the pattern/port/dest configuration, then stream random training
    reads from NCBI to the vw daemons until the process is stopped."""
    parser = argparse.ArgumentParser(
        description=
        'Download reference sequences from NCBI datasets and feed the vw classifier(s)'
    )
    parser.add_argument(
        '--pattern_port',
        dest='pattern_port_file',
        type=str,
        required=True,
        help=
        'Tabular file listing for each daemon: initial_model, pattern, port and destination repertory. See config/model_pattern_port_dest.txt for an example'
    )
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='For detailed output')
    args = parser.parse_args(argv)  # use argv when given, otherwise sys.argv

    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO

    logger.setLevel(log_level)
    ch = logging.StreamHandler()
    ch.setLevel(log_level)
    formatter = logging.Formatter("%(asctime)s (%(levelname)s) - %(message)s",
                                  "%Y-%m-%d %H:%M:%S")
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    logger.info("PREPROCESS")

    # Each line: initial_model <TAB> pattern <TAB> port <TAB> dest; the first
    # column (initial_model) is dropped here and empty fields are skipped.
    with open(args.pattern_port_file, 'r') as infile:
        pattern_port_dest_list = [[
            info for info in line.rstrip("\n").split("\t")[1:] if info
        ] for line in infile]
    logger.debug("Parsing pattern_port file for [pattern, port, dest]")
    logger.debug(pattern_port_dest_list)
    pattern_port_dict = defaultdict(list)  # pattern -> list of daemon ports
    port_dest_dict = {}  # port -> destination directory
    for pattern_port in pattern_port_dest_list:
        pattern_port_dict[pattern_port[0]].append(pattern_port[1])
        port_dest_dict[pattern_port[1]] = pattern_port[2]

    with open("pattern_port_dict.pickle", 'wb') as output:
        pickle.dump(pattern_port_dict, output)
    with open("port_dest_dict.pickle", 'wb') as output:
        pickle.dump(port_dest_dict, output)

    random.seed()
    # One slot per category, reassigned on every training round.
    train_gis = [[] for _ in range(4)]
    cpt = 0  # cumulative number of reads sent per category

    logger.info("Getting taxids...")
    with open("taxids.pickle", 'rb') as input:
        taxids = pickle.load(input)

    logger.info("TRAINING")
    while True:
        # Draw a fresh random sample for every category.
        for k in categories.keys():
            train_gis[k] = random_sampling(taxids[k])

        # Truncate every category to the smallest sample to keep classes balanced.
        min_size = min([len(l) for l in train_gis])
        cpt += min_size
        logger.debug([len(l) for l in train_gis])
        train_gis = [l[:min_size] for l in train_gis]
        logger.debug(train_gis)

        for i in range(min_size):
            for cat in categories.keys():
                (gi, pos, read_size) = train_gis[cat][i]
                logger.debug((cat, gi, pos, read_size))
                # Fetch the selected slice of the reference sequence from NCBI.
                seq = ncbi_data.get(ncbi_data.get_gis_slice_seq, gi, pos,
                                    pos + read_size - 1)
                logger.debug(seq)
                for pattern in pattern_port_dict.keys():
                    logger.debug(pattern)
                    # Build vw features for this read and send them to every
                    # daemon listening for this k-mer pattern.
                    features = kmerize(seq, pattern, ref_read_size)
                    vw_feat = vw_daemon.print_vw_features(
                        int(gi), features, cat + 1)
                    logger.debug(pattern_port_dict[pattern])
                    logger.debug(vw_feat)
                    for port in pattern_port_dict[pattern]:
                        vw_daemon.send_to_daemon(port, vw_feat)
            time.sleep(sleep_delay)
        logger.info(cpt)  # cumulative reads sent per category

    return 0  # never reached: the loop above runs until the process is stopped
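
main() expects each line of the --pattern_port file to carry initial_model, pattern, port and destination separated by tabs; the first column is dropped, then pattern_port_dict maps each pattern to its ports and port_dest_dict maps each port to its destination. A self-contained sketch of that parsing step, where the model names, patterns, ports and paths are purely hypothetical:

from collections import defaultdict

# Hypothetical lines: initial_model <TAB> pattern <TAB> port <TAB> dest
lines = [
    "model_a.vw\t110110\t26542\t/data/out_a\n",
    "model_b.vw\t110110\t26543\t/data/out_b\n",
]
rows = [[f for f in line.rstrip("\n").split("\t")[1:] if f] for line in lines]
pattern_port_dict = defaultdict(list)  # pattern -> list of ports
port_dest_dict = {}                    # port -> destination directory
for pattern, port, dest in rows:
    pattern_port_dict[pattern].append(port)
    port_dest_dict[port] = dest
print(dict(pattern_port_dict))  # {'110110': ['26542', '26543']}
print(port_dest_dict)           # {'26542': '/data/out_a', '26543': '/data/out_b'}
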
Example #4
def main(argv=None):
    """Parse the pattern/port/dest configuration, then stream random training
    reads from NCBI to the vw daemons until the process is stopped."""
    parser = argparse.ArgumentParser(description='Download reference sequences from NCBI datasets and feed the vw classifier(s)')
    parser.add_argument('--pattern_port', dest='pattern_port_file', type=str, required=True, 
                        help='Tabular file listing for each daemon: initial_model, pattern, port and destination repertory. See config/model_pattern_port_dest.txt for an example')
    parser.add_argument('--debug', dest='debug', action='store_true', 
                        help='For detailed output')
    args = parser.parse_args(argv)  # use argv when given, otherwise sys.argv

    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO

    logger.setLevel(log_level)
    ch = logging.StreamHandler()
    ch.setLevel(log_level)
    formatter = logging.Formatter("%(asctime)s (%(levelname)s) - %(message)s",
                                  "%Y-%m-%d %H:%M:%S")
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    logger.info("PREPROCESS")

    with open(args.pattern_port_file, 'r') as infile:
        pattern_port_dest_list = [[info for info in line.rstrip("\n").split("\t")[1:] if info]
                                  for line in infile]
    logger.debug("Parsing pattern_port file for [pattern, port, dest]")
    logger.debug(pattern_port_dest_list)
    pattern_port_dict = defaultdict(list)
    port_dest_dict = {}
    for pattern_port in pattern_port_dest_list:
        pattern_port_dict[pattern_port[0]].append(pattern_port[1])
        port_dest_dict[pattern_port[1]] = pattern_port[2]

    with open("pattern_port_dict.pickle", 'wb') as output:
        pickle.dump(pattern_port_dict, output)
    with open("port_dest_dict.pickle", 'wb') as output:
        pickle.dump(port_dest_dict, output)

    random.seed()
    train_gis = [[] for _ in range(4)]
    cpt = 0

    logger.info("Getting taxids...")
    with open("taxids.pickle", 'rb') as input:
        taxids = pickle.load(input)

    logger.info("TRAINING")
    while True:
        for k in categories.keys():
            train_gis[k] = random_sampling(taxids[k])
                
        min_size = min([len(l) for l in train_gis])
        cpt += min_size
        logger.debug([len(l) for l in train_gis])
        train_gis = [l[:min_size] for l in train_gis]
        logger.debug(train_gis)

        for i in range(min_size):
            for cat in categories.keys():
                (gi, pos, read_size) = train_gis[cat][i]
                logger.debug((cat, gi, pos, read_size))
                seq = ncbi_data.get(ncbi_data.get_gis_slice_seq, gi, pos, pos+read_size-1)
                logger.debug(seq)
                for pattern in pattern_port_dict.keys():
                    logger.debug(pattern)
                    features = kmerize(seq, pattern, ref_read_size)
                    vw_feat = vw_daemon.print_vw_features(int(gi), features, cat + 1)
                    logger.debug(pattern_port_dict[pattern])
                    logger.debug(vw_feat)
                    for port in pattern_port_dict[pattern]:
                        vw_daemon.send_to_daemon(port, vw_feat)
            time.sleep(sleep_delay)
        logger.info(cpt)

    return 0
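
The training loop in both versions above keeps the categories balanced by truncating every per-category sample to the length of the shortest one before sending reads. A minimal sketch of that balancing step with made-up [gi, pos, read_size] triples:

train_gis = [
    [[101, 0, 100], [102, 5, 100], [103, 9, 100]],  # category 0 (made-up triples)
    [[201, 3, 100], [202, 7, 100]],                  # category 1
]
min_size = min(len(l) for l in train_gis)
train_gis = [l[:min_size] for l in train_gis]
print(min_size, [len(l) for l in train_gis])  # -> 2 [2, 2]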