Example #1
# imports assumed by this excerpt; defaults, cfgutil, mmctlui and load_meta
# are project-local
import os
from flask import Flask
import defaults  # local default-configuration module (assumed)
from flask_assets import Environment as AssetEnvironment  # assuming Flask-Assets

def create_app(configfile='mmctl.conf'):
    app = Flask(__name__)
    app.version = '0.2.0'
    app.config.from_object(defaults)

    try:
        conffile = os.path.join(app.instance_path, configfile)
        app.config.from_pyfile(conffile)
    except IOError:
        # load configuration blueprint
        from cfgutil import cfgutil
        import random
        app.register_blueprint(cfgutil)

        # generate initial salt
        if app.config['PBKDF2_SALT'] is None:
            app.config['PBKDF2_SALT'] = \
                '%x' % random.SystemRandom().getrandbits(96)
    else:
        # load api/ui blueprint
        from mmctlui import mmctlui
        app.register_blueprint(mmctlui)

        # load the slice file; this could be done dynamically later on
        app.meta = load_meta(app, app.config['ICE_STRING'])

    # initialise the asset (webassets) environment
    AssetEnvironment(app)

    return app
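A minimal way to serve the app this factory returns (a sketch; host and port are illustrative):

if __name__ == '__main__':
    app = create_app()
    app.run(host='127.0.0.1', port=5000, debug=True)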
Example #2

# imports assumed by this excerpt; get_answers_list and
# extract_features_from_body are project-local helpers
import numpy as np
import utils
from data import chosen, chosen_meta

def get_XY_vectors():
    meta, id_to_idx, idx_to_id = utils.load_meta(chosen_meta)
    all_answers = get_answers_list(meta)

    Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])
    x = [extract_features_from_body(text)
         for post_id, text in utils.fetch_posts(chosen)
         if post_id in all_answers]
    X = np.asarray(x)
    return X, Y
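A quick way to exercise the helper (a sketch; the k-NN classifier mirrors the sklearn.neighbors import used elsewhere on this page):

from sklearn import neighbors

X, Y = get_XY_vectors()
clf = neighbors.KNeighborsClassifier(n_neighbors=5)
clf.fit(X, Y)
print('training accuracy: %.3f' % clf.score(X, Y))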
Example #3

# imports assumed by this excerpt; load_meta, get_bin_measures and
# confusion_matrix are project-local helpers
import numpy as np

def get_photons(amount=1):
    print('amount = ', amount)
    alldata = load_meta(kind='pt_outputs', amount=amount)
    cur_seg, pred_seg_res, cur_data, _, trainbool, astro_dict = alldata[-amount]
    del alldata

    metrics = get_bin_measures(cur_seg, pred_seg_res, sum=False)
    true_pos, false_neg, false_pos, true_neg = np.sum(metrics, axis=1)
    print(astro_dict)
    print(trainbool)
    print(
        confusion_matrix(false_neg, true_pos, true_neg, false_pos,
                         true_neg + false_pos, true_pos + false_neg))

    all_photons = np.concatenate((cur_data[metrics[0]], cur_data[metrics[1]],
                                  cur_data[metrics[2]], cur_data[metrics[3]]),
                                 axis=0)
    star_photons = np.concatenate((cur_data[metrics[1]], cur_data[metrics[3]]),
                                  axis=0)
    planet_photons = np.concatenate(
        (cur_data[metrics[0]], cur_data[metrics[2]]), axis=0)

    return all_photons, star_photons, planet_photons
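Calling the function and summarising the three photon sets it returns (a sketch):

all_photons, star_photons, planet_photons = get_photons(amount=1)
print('%d photons total: %d star, %d planet'
      % (len(all_photons), len(star_photons), len(planet_photons)))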
Example #4
# from sklearn.model_selection import KFold
import time
import numpy as np

from sklearn import neighbors
from data import chosen, chosen_meta
from utils import plot_roc, plot_pr
from utils import plot_feat_importance
from utils import load_meta
from utils import fetch_posts
from utils import plot_feat_hist
from utils import plot_bias_variance
from utils import plot_k_complexity

start_time = time.time()

# question Id -> {'features' -> feature vector, 'answers' -> [answer Ids], 'scores' -> [scores]}
# scores are added on the fly as they are not in meta
meta, id_to_idx, idx_to_id = load_meta(chosen_meta)

import nltk

# split questions into train (70%) and test (30%) sets, then take their answers
all_posts = list(meta.keys())
all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]
all_answers = [q for q, v in meta.items() if v['ParentId'] != -1]  # [:500]

feature_names = np.array(
    ('NumTextTokens', 'NumCodeLines', 'LinkCount', 'AvgSentLen', 'AvgWordLen',
     'NumAllCaps', 'NumExclams', 'NumImages'))

# activate the following for reduced feature space
"""
Example #5
    # initialise paths
    save_root = os.path.join(opt.checkpoint_dir, opt.tag)
    log_root = os.path.join(opt.log_dir, opt.tag)

    utils.try_make_dir(save_root)
    utils.try_make_dir(log_root)

    # dataloaders (assumed to be provided by the enclosing scope)
    train_dataloader = train_dataloader
    val_dataloader = val_dataloader

    # initialise logging
    logger = init_log(training=True)

    # initialise the training meta information
    meta = load_meta(new=True)
    save_meta(meta)

    # initialise the model
    Model = get_model(opt.model)
    model = Model(opt, logger)

    # multi-GPU is not supported yet
    # if len(opt.gpu_ids):
    #     model = torch.nn.DataParallel(model, device_ids=opt.gpu_ids)
    model = model.to(device=opt.device)

    if opt.load:
        load_epoch = model.load(opt.load)
        start_epoch = load_epoch + 1 if opt.resume else 1
    else:
Example #6

from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in 0.20
from sklearn import neighbors
import numpy as np

from data import chosen, chosen_meta
from utils import plot_roc, plot_pr
from utils import plot_feat_importance
from utils import load_meta
from utils import fetch_posts
from utils import plot_feat_hist
from utils import plot_bias_variance
from utils import plot_k_complexity

# question Id -> {'features' -> feature vector, 'answers' -> [answer Ids], 'scores' -> [scores]}
# scores are added on the fly as they are not in meta
meta, id_to_idx, idx_to_id = load_meta(chosen_meta)

import nltk

# split questions into train (70%) and test (30%) sets, then take their answers
all_posts = list(meta.keys())
all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]
all_answers = [q for q, v in meta.items() if v['ParentId'] != -1]  # [:500]

feature_names = np.array((
    'NumTextTokens',
    'NumCodeLines',
    'LinkCount',
    'AvgSentLen',
    'AvgWordLen',
Example #7
# imports assumed by this excerpt; the DB models (Base, Surah, Verse, Note),
# load_meta and the Kyrgyz number helpers are project-local
import re
import functools as FT
import itertools as IT
import typing as T
from pathlib import Path

from bs4 import BeautifulSoup

def main():

    import optparse

    p = optparse.OptionParser(usage='%prog [options] <src_dir> <db_path>')

    opts, args = p.parse_args()

    src_dir, db_path = map(Path, args)
    assert src_dir.is_dir()
    assert not db_path.exists()

    Session, db_engine = make_session_and_engine(db_path)
    Base.metadata.create_all(db_engine)
    session = Session()

    title_regexp = re.compile(
        r'\s*(\d+)\s*(?:-|–)\s*(?:\w+),?.?\s*(?:“|«|\")?\s*((?:\'|`|[^\d\W])+\s*(?:`|[^\d\W])+)\s*(?:\d*)\s*(?:”|»|\")?',
        re.UNICODE)

    BISMILLAH_LITERALS = {
        'Мээримдүү, Ырайымдуу Аллахтын аты менен',
        'Ырайымдуу, Мээримдүү Аллахтын аты менен'
    }

    surah_infos = load_meta()

    note_id_func = FT.partial(next, infinite_numbers_gen())

    def link_positions(text) -> T.Tuple[str, T.List[T.Tuple[int, int]]]:
        """
            'Some{1} text with links{2} in it{3}!' -> ('Some text with links in it!', [(4, 1), (20, 2), (26, 3)])
        """
        chunks = re.split(
            r'(\{\d+\})', text
        )  # ['Some', '{1}', ' text with links', '{2}', ' in it', '{3}', '!']

        text_chunks = []
        links = []
        for i, ch in enumerate(chunks):
            if i % 2 == 0:
                text_chunks.append(ch)
            else:
                # (index, link_id)
                links.append((sum(map(len, text_chunks)), int(ch[1:-1])))

        return ''.join(text_chunks), links

    for findex, p in enumerate(
            sorted(src_dir.glob('suro_*.html'),
                   key=lambda i: int(''.join(ch for ch in i.name
                                             if ch.isdigit())))):

        with p.open() as f:
            soup = BeautifulSoup(f.read(), features='lxml')

            # note_links = soup.find_all('a', {'class': 'sdfootnoteanc'})
            # assert len(soup.find_all('div', {'class': 'hidden'})) == 1
            notes_container = soup.select_one('div.hidden')

            def get_link_content(link_el):
                content_el = notes_container.select_one('div{} > p'.format(
                    link_el['href'][:-3]))
                # assert len(content_el.find_all('a')) == 1
                content_el.select_one('a').decompose()
                # assert len(content_el.find_all('sup')) == 1, link_el['href']
                for el in content_el.select('sup'):
                    el.decompose()
                return ' '.join(content_el.text.split())

            # title parsing
            title_container = soup.find('div', {'class': 'title-cont'})
            main_title = title_container.select_one(
                'div.title-parent > div.title-center')
            # assert len(main_title.select('a')) in (0, 1), p.name
            title_note_link = main_title.select_one('a')
            # assert not title_note_link or title_note_link['name'] != '_GoBack'
            title_note_content = get_link_content(
                title_note_link) if title_note_link else None
            # print(p, repr(title_note_content))
            # continue

            title = main_title.text
            suro_num_literal, title = title_regexp.match(title).groups()
            title = ' '.join(title.split())
            surah_num = int(suro_num_literal)
            # print(surah_num, title)

            surah_info = surah_infos[surah_num - 1]

            raw_kek = [
                i.text for i in title_container.find_all('p', recursive=False)
            ]
            kek = [
                ' '.join(i.split()).rstrip('.!') for i in raw_kek
                if 'бөлүм' not in i and i.strip()
            ]

            has_bismillah_pre = BISMILLAH_LITERALS.intersection(kek)
            assert surah_num not in (1, 9) or not has_bismillah_pre, kek
            assert surah_num in (1, 9) or has_bismillah_pre, kek

            kek = [i for i in kek if i not in BISMILLAH_LITERALS]

            assert len(kek) in (0, 1), (p, surah_num, title, kek)
            # the assert above permits an empty list, but every source file is
            # expected to carry exactly one info line
            (info, ) = kek
            assert '.' in info
            revelation_place, ayat_number_text = [
                i.strip().capitalize() for i in info.split('.')
            ]
            assert {
                'Меккеде': 'Makkah',
                'Мединада': 'Madinah'
            }[revelation_place.split()[0]] == surah_info['revelation_place']
            # print('\t', revelation_place, ayat_number_text)
            revelation_place = revelation_place.split()[0][:-2]

            # todo: check ayat_number_text
            ayattan_turat_literal = 'айаттан турат'
            assert ayat_number_text.endswith(ayattan_turat_literal), repr(
                ayat_number_text)
            ayat_number_text = ayat_number_text[:-len(ayattan_turat_literal)].rstrip()

            ayat_number = ky_number_text_to_int(ayat_number_text)

            assert ayat_number == surah_info['verses_count'], (
                ayat_number_text, '->', ayat_number, '!=',
                surah_info['verses_count'])
            assert int_to_ky_text(ayat_number).capitalize() == ayat_number_text, (
                int_to_ky_text(ayat_number).capitalize(), ayat_number_text)
            # print(ky_to_int_vals)

            # content parsing
            content_lists = soup.find_all('ol')
            # content_lines = soup.find_all('li')
            content_lines = IT.chain.from_iterable(
                ol.find_all('li') for ol in content_lists)
            # assert len(list(content_lines)) == len(soup.find_all('li'))

            content_lines = list(content_lines)
            # contains_ayah_number_regexp = re.compile(r'^\d+\.', re.UNICODE)
            # assert not any(contains_ayah_number_regexp.match(l) for l in content_lines)
            cll = len(content_lines)

            assert cll == ayat_number, (cll, ayat_number)

            def process_content_line(el):
                link_contents = []
                i = 0
                for link in el.find_all('a', {'class': 'sdfootnoteanc'}):
                    link_cont = get_link_content(link)
                    if link_cont:
                        link_contents.append(link_cont)
                        link.replace_with('{{{}}}'.format(i))
                        i += 1
                    else:
                        print('\tempty note:', surah_num, title, link['href'])
                        link.decompose()
                text = ' '.join(el.text.split())
                text, notes = link_positions(text)
                return text, [(str_index, link_contents[i])
                              for str_index, i in notes]

            verses = list(map(process_content_line, content_lines))

            surah = Surah(
                number=surah_info['number'],
                title=title,
                verses_count=ayat_number,
                revelation_place=revelation_place,
                chronological_order=surah_info['chronological_order'],
                bismillah_pre=bool(has_bismillah_pre),
                title_note=title_note_content,
            )

            def make_verse(index, verse_text, notes):
                verse = Verse(number=index + 1, text=verse_text)
                verse.notes = [
                    Note(text_position=str_index, text=text)
                    for str_index, text in notes
                ]
                return verse

            surah.verses = [
                make_verse(i, t, n) for i, (t, n) in enumerate(verses)
            ]

            session.add(surah)
            session.commit()

            print(surah)
            assert surah.id == surah_info['number'] == surah.number == findex + 1
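Per the optparse usage string above, the script takes a source directory and a database path (the script name is an assumption):

python build_quran_db.py ./suro_html ./quran.db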
Example #8
# imports assumed by this excerpt; utils, train and make_argument_parser
# are project-local
import os
import sys
import errno
import numpy as np

def main():
    """
    The main executable function
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    input_dirs = args.inputdirs
    tf = args.factor
    valid_chroms = args.validchroms
    valid_input_dirs = args.validinputdirs
    test_chroms = args.testchroms
    epochs = args.epochs
    patience = args.patience
    learningrate = args.learningrate
    seed = args.seed
    utils.set_seed(seed)
    dropout_rate = args.dropout
    L = args.seqlen
    w = args.motifwidth
    utils.L = L
    utils.w = w
    utils.w2 = w // 2  # integer half-width (Python 2's "/" was integer division here)
    negatives = args.negatives
    assert negatives > 0
    meta = args.meta
    gencode = args.gencode
    motif = args.motif

    num_motifs = args.kernels
    num_recurrent = args.recurrent
    num_dense = args.dense

    features = ['bigwig']

    if tf:
        print('Single-task training:', tf)
        singleTask = True
        if meta:
            print('Including metadata features')
            features.append('meta')
        if gencode:
            print('Including genome annotations')
            features.append('gencode')
    else:
        print('Multi-task training')
        singleTask = False
        # cannot use any metadata features in multi-task mode
        assert not meta
        assert not gencode

    if args.outputdir is None:
        clobber = True
        output_dir = args.outputdirc
    else:
        clobber = False
        output_dir = args.outputdir

    try:  # adapted from dreme.py by T. Bailey
        os.makedirs(output_dir)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            if not clobber:
                print(('output directory (%s) already exists '
                       'but you specified not to clobber it') % output_dir,
                      file=sys.stderr)
                sys.exit(1)
            else:
                print(('output directory (%s) already exists '
                       'so it will be clobbered') % output_dir, file=sys.stderr)

    print('Loading genome')
    genome = utils.load_genome()
    if valid_input_dirs:
        print('You specified at least one validation input directory')
        assert singleTask  # this option only works for single-task training
    print('Loading ChIP labels')
    if singleTask:
        chip_bed_list, nonnegative_regions_bed_list = \
            utils.load_chip_singleTask(input_dirs, tf)
        if valid_input_dirs:
            valid_chip_bed_list, valid_nonnegative_regions_bed_list = \
                utils.load_chip_singleTask(valid_input_dirs, tf)
        num_tfs = 1
    else:
        # multi-task training only supports one cell line
        assert len(input_dirs) == 1
        input_dir = input_dirs[0]
        tfs, positive_windows, y_positive, nonnegative_regions_bed = \
            utils.load_chip_multiTask(input_dir)
        num_tfs = len(tfs)
    print('Loading bigWig data')
    bigwig_names, bigwig_files_list = utils.load_bigwigs(input_dirs)
    num_bigwigs = len(bigwig_names)
    if valid_input_dirs:
        valid_bigwig_names, valid_bigwig_files_list = utils.load_bigwigs(
            valid_input_dirs)
        assert valid_bigwig_names == bigwig_names
    if not singleTask:
        bigwig_files = bigwig_files_list[0]
    if meta:
        print('Loading metadata features')
        meta_names, meta_list = utils.load_meta(input_dirs)
        if valid_input_dirs:
            valid_meta_names, valid_meta_list = utils.load_meta(
                valid_input_dirs)
            assert valid_meta_names == meta_names
    else:  # meta option was not selected, pass empty metadata features to the functions
        meta_list = [[] for bigwig_files in bigwig_files_list]
        if valid_input_dirs:
            valid_meta_list = [[] for bigwig_files in valid_bigwig_files_list]

    print('Making features')
    if singleTask:
        if not valid_input_dirs:  # validation dirs not used; pass placeholder values
            valid_chip_bed_list = None
            valid_nonnegative_regions_bed_list = None
            valid_bigwig_files_list = None
            valid_meta_list = None
        datagen_train, datagen_valid = \
            utils.make_features_singleTask(chip_bed_list,
            nonnegative_regions_bed_list, bigwig_files_list, bigwig_names,
            meta_list, gencode, genome, epochs, negatives, valid_chroms, test_chroms,
            valid_chip_bed_list, valid_nonnegative_regions_bed_list,
            valid_bigwig_files_list, valid_meta_list)
    else:
        datagen_train, datagen_valid = \
            utils.make_features_multiTask(positive_windows, y_positive,
            nonnegative_regions_bed, bigwig_files, bigwig_names,
            genome, epochs, valid_chroms, test_chroms)
    print('Building model')
    if num_recurrent == 0:
        print('You specified 0 LSTM units. Omitting BLSTM layer')
    if num_recurrent < 0:
        print('You specified less than 0 LSTM units. '
              'Replacing BLSTM layer with global max-pooling layer')
    if meta or gencode:
        num_meta = 0
        if meta:
            num_meta = len(meta_names)
        if gencode:
            num_meta += 6
        model = utils.make_meta_model(num_tfs, num_bigwigs, num_meta,
                                      num_motifs, num_recurrent, num_dense,
                                      dropout_rate)
    else:
        model = utils.make_model(num_tfs, num_bigwigs, num_motifs,
                                 num_recurrent, num_dense, dropout_rate)

    if motif:
        assert singleTask  # This option only works with single-task training
        motifs_db = utils.load_motif_db('resources/HOCOMOCOv9.meme')
        if tf in motifs_db:
            print('Injecting canonical motif')
            pwm = motifs_db[tf]
            pwm += 0.001
            pwm = pwm / pwm.sum(axis=1)[:, np.newaxis]
            pwm = np.log2(pwm / 0.25)
            utils.inject_pwm(model, pwm)
    with open(output_dir + '/chip.txt', 'w') as output_tf_file:
        if singleTask:
            output_tf_file.write("%s\n" % tf)
        else:
            for tf in tfs:
                output_tf_file.write("%s\n" % tf)
    with open(output_dir + '/feature.txt', 'w') as output_feature_file:
        for feature in features:
            output_feature_file.write("%s\n" % feature)
    with open(output_dir + '/bigwig.txt', 'w') as output_bw_file:
        for bw in bigwig_names:
            output_bw_file.write("%s\n" % bw)
    if meta:
        with open(output_dir + '/meta.txt', 'w') as output_meta_file:
            for meta_name in meta_names:
                output_meta_file.write("%s\n" % meta_name)
    with open(output_dir + '/model.json', 'w') as output_json_file:
        output_json_file.write(model.to_json())
    train(datagen_train, datagen_valid, model, epochs, patience, learningrate,
          output_dir)
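The excerpt ends at the train() call; the usual entry-point guard (assumed, not shown in the excerpt) would close the script:

if __name__ == '__main__':
    main()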