Example #1
    def test_unit_bytes_per_second(self):
        ft = Featurizer()

        ratio_1 = ft.bytes_per_second(0, 0)
        self.assertEqual(0, ratio_1)
        ratio_2 = ft.bytes_per_second(5, 0)
        self.assertEqual(5, ratio_2)
        ratio_3 = ft.bytes_per_second(5, 5)
        self.assertEqual(1, ratio_3)
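The Featurizer implementation itself is not part of this listing; a hypothetical sketch of bytes_per_second that would satisfy the assertions above (the fallback to the raw byte count when the duration is zero is an assumption, not confirmed by the source) could look like:

def bytes_per_second(byte_count, duration):
    # Assumed behaviour: with a zero duration, return the byte count itself;
    # otherwise return the plain bytes/second ratio (0, 5 and 1 above).
    if duration == 0:
        return byte_count
    return byte_count / float(duration)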
Example #2
    def featurize_data(self, data, models):
        logging.warning('featurizing train...')
        f = Featurizer(self.conf)
        sample, labels = f.featurize(data, models)
        self.labels = array(labels)

        # get word pairs and headers
        self.header, self.words = f.convert_to_wordpairs(sample)

        logging.info('HEADERS: {0}'.format(self.header))
        # print [s.features for s in sample]
        logging.info('converting table...')
        self.data = f.convert_to_table(sample)
        logging.info('data shape: {0}'.format(self.data.shape))
        logging.info('labels shape: {0}'.format(self.labels.shape))
        self.feats = f._feat_order
Example #3
def worker_task(files, args, worker_id):
    featurizer = Featurizer()
    cooc = {}
    Counter = 0  # Number of articles processed
    file_count = 0
    for inputfile in files:
        file_name, _ = splitext(basename(inputfile))
        output_file = open(join(args.outputpath, file_name + ".cooc"), 'wb')
        sys.stdout.write("{}: Processing file:{}\n".format(
            worker_id, inputfile))
        dic = {}
        with open(inputfile, "r") as F:
            read = 0
            while True:
                try:
                    text = F.read(4096)
                    if len(text) == 0:
                        break
                    read += len(text)
                    process_features(text, featurizer, cooc, args.window_size)
                except:
                    break
                print("read:{}/{}, {}%".format(read, 1E9,
                                               read / float(1E9) * 100))

        #dump the gram info
        dump_cooc_to_file(worker_id, cooc, output_file)

        sys.stdout.write("{}: Finished processing file:{}\n".format(
            worker_id, inputfile))
        file_count += 1

        # clear up
        del dic
Example #4
    def test_unit_featurizer(self):
        ft = Featurizer()

        features = '0,0,0,0,1.0'

        key1 = '00:00:00:00:00:01_00:00:00:00:00:02_6_2'
        key2 = '00:00:00:00:00:02_00:00:00:00:00:01_6_2'
        key3 = '00:00:00:00:00:02_00:00:00:00:00:03_6_2'

        flow_dict = {}
        flow_dict[key1] = "00:00:00:00:00:01,00:00:00:00:00:02,6,2,8,8,0,480,480,0,1,0,1\n"
        flow_dict[key2] = "00:00:00:00:00:02,00:00:00:00:00:01,6,2,8,8,0,480,480,0,1,0,1\n"
        flow_dict[key3] = "00:00:00:00:00:02,00:00:00:00:00:03,6,2,8,8,0,480,480,0,1,0,1\n"

        stat = ['00:00:00:00:00:01','00:00:00:00:00:02','6','2','0','0','0','0','0','0','1','0','0']
        self.assertEqual(features,ft.featurizer(stat, flow_dict))
Example #5
def worker_task(files, args, worker_id):
    labeler = FeatureLabelerHungry()
    featurizer = Featurizer(labeler=labeler)
    Counter = 0  # Number of articles processed
    file_count = 0
    for inputfile in files:
        sys.stdout.write("{}: Processing file:{}\n".format(
            worker_id, inputfile))
        dic = {}
        with open(inputfile, "r") as F:
            # All articles begin with '<doc' and end with '</doc>'
            # for line in F:
            # 	if line.startswith("<doc"):
            # 		continue
            # 	if line.startswith("</doc>"):
            # 		# some paragraph ends
            # 		Counter += 1
            # 		continue

            read = 0
            it = 0
            while True:
                try:
                    # filter_with_alphabet(sanitize_line(F.read(1024)), args.alphabet)
                    text = F.read(4096)
                    if len(text) == 0:
                        break
                    read += len(text)
                    features = featurizer.featurize(text)
                    labeler.increment_features(features)
                except:
                    break
                print("read:{}/{}, {}%".format(read, 1E9,
                                               read / float(1E9) * 100))

        #dump the gram info
        file_name, _ = splitext(basename(inputfile))
        output_file = join(args.outputpath, file_name + ".fcount")
        labeler.dump(output_file)

        sys.stdout.write("{}: Finished processing file:{}\n".format(
            worker_id, inputfile))
        file_count += 1
        # clear up
        del dic
Example #6
    def test_unit_pair_flow_ratio(self):
        ft = Featurizer()

        key1 = '00:00:00:00:00:01_00:00:00:00:00:02_6_2'
        key2 = '00:00:00:00:00:02_00:00:00:00:00:01_6_2'
        key3 = '00:00:00:00:00:02_00:00:00:00:00:03_6_2'

        flow_dict = {}
        flow_dict[key1] = "00:00:00:00:00:01,00:00:00:00:00:02,6,2,8,8,0,480,480,0,20,0,1\n"
        flow_dict[key2] = "00:00:00:00:00:02,00:00:00:00:00:01,6,2,8,8,0,480,480,0,10,0,1\n"
        flow_dict[key3] = "00:00:00:00:00:02,00:00:00:00:00:03,6,2,8,8,0,480,480,0,5,0,1\n"

        stat = ['00:00:00:00:00:01','00:00:00:00:00:02','6','2','8','8','0','480','480','0','20','0','1\n']
        stat1 = ['00:00:00:00:00:02','00:00:00:00:00:03','6','2','8','8','0','480','480','0','5','0','1\n']

        ratio_1 = ft.pair_flow_ratio(stat, flow_dict)
        self.assertEqual(2.0, ratio_1)
        ratio_2 = ft.pair_flow_ratio(stat1, flow_dict)
        self.assertEqual(5, ratio_2)
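The pair_flow_ratio source is not shown here; a hypothetical sketch consistent with the two assertions above (the dst_src_proto_state key layout and the use of field 10 as the aggregated flow count are assumptions inferred from the test data) could be:

def pair_flow_ratio(stat, flow_dict):
    # Assumed behaviour: ratio of this direction's flow count to the reverse
    # direction's count; when no reverse flow exists, return the forward
    # count itself (2.0 and 5 in the assertions above).
    src, dst, proto, state = stat[0], stat[1], stat[2], stat[3]
    forward = int(stat[10])
    reverse_key = "{}_{}_{}_{}".format(dst, src, proto, state)
    if reverse_key not in flow_dict:
        return forward
    reverse = int(flow_dict[reverse_key].split(",")[10])
    return forward / float(reverse) if reverse else forward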
Example #7
def worker_task(files, args, worker_id):
    featurizer = Featurizer(Settings())
    Counter = 0  # Number of articles processed
    file_count = 0
    text = []
    for inputfile in files:
        sys.stdout.write("{}: Processing file:{}\n".format(
            worker_id, inputfile))
        dic = {}
        with open(inputfile, "r") as F:
            file_name, _ = splitext(basename(inputfile))
            F_out = open(join(args.outputpath, file_name + ".features"), "w")
            text = []
            # All articles begin with '<doc' and end with '</doc>'
            for line in F:
                if line.startswith("<doc"):
                    continue
                if line.startswith("</doc>"):
                    # some paragraph ends
                    featurizer.featurize(text)
                    Counter += 1
                    if Counter % 1 == 0:
                        sys.stdout.write(
                            "{}: Finished processing article:{}\n".format(
                                worker_id, Counter))
                        if Counter % 50 == 0:
                            exit(0)
                    text = []
                    continue
                text.extend(
                    word_tokenize(
                        filter_with_alphabet(sanitize_line(line),
                                             args.alphabet)))
                # F_out.write(str(featurizer.featurize(word_tokenize(filter_with_alphabet(sanitize_line(line), args.alphabet)))))
                # text.extend()
                #
            F_out.close()
        sys.stdout.write("{}: Finished processing file:{}\n".format(
            worker_id, inputfile))
        file_count += 1
        # clear up
        del dic
Example #8
def main():

    if opts.input is None:
        docs_in = sys.stdin
    else:
        docs_in = open(opts.input)

    if opts.output is None:
        scores_out = sys.stdout
    else:
        scores_out = open(opts.output, 'w')

    bundle = pickle.load(open(opts.model))
    clf = bundle['clf']
    dv = bundle['dv']
    ftzr = Featurizer(parsecachepath=opts.cache, use=opts.features)

    if opts.preproc == 'nltk':
        preprocessor = NLTKPreprocessor()
    else:
        preprocessor = StanfordPreprocessor()

    for doc in docs_in:
        if doc.strip() == '':
            scores_out.write('\n')
        else:
            if opts.nosplit:
                avg, fstr = score(doc, ftzr, dv, clf)
                out = '%s' % avg
                if opts.dump: out += '\t%s' % fstr
            else:
                sentences = preprocessor.parse(doc)['sentences']
                avg, scores, fstrs = score_doc(sentences, ftzr, dv, clf)
                out = '%s' % avg
                if opts.sentscores:
                    out += '\t%s' % (','.join(['%f' % s for s in scores]))
                if opts.dump: out += '\t%s' % ','.join(fstrs)
            scores_out.write('%s\n' % out)
    scores_out.close()
    ftzr.close()
Example #9
    def synthesizeUniqueFeatures(self, intBaseFeat, boolBaseFeat,
                                 baseFeatureValues, exclude):
        syntFeats: Tuple[
            PrecisFeature] = self.featureSynthesizer.synthesizeFeatures(
                intBaseFeat, boolBaseFeat, baseFeatureValues)
        # if boolBaseFeat empty, no derived bool features will be generated -> consider refactor
        genFeats: Tuple[
            PrecisFeature] = self.featureSynthesizer.GenerateDerivedFeatures(
                intBaseFeat, boolBaseFeat)
        derivFeats: Tuple[
            PrecisFeature] = Featurizer.mergeSynthesizedAndGeneratedFeatures(
                syntFeats, genFeats)
        uniqueDerivFeats = tuple([f for f in derivFeats if f not in exclude])
        return uniqueDerivFeats
Example #10
def worker_task(files, args, worker_id):
    featurizer = Featurizer()
    cooc = {}  # in format (word1, word2) : count
    Counter = 0  # Number of articles processed
    file_count = 0
    text = []
    for inputfile in files:
        tokens_count = 0
        file_name, _ = splitext(basename(inputfile))
        F_out = open(join(args.outputpath, file_name + ".cooc_chunked"), 'wb')
        sys.stdout.write("{}: Processing file:{}\n".format(
            worker_id, inputfile))
        with open(inputfile, "r") as F:
            text = []
            chars = 0
            # All articles begin with '<doc' and end with '</doc>'
            for line in F:
                if line.startswith("<doc"):
                    continue
                if line.startswith("</doc>"):
                    # some paragraph ends
                    tokens_count += process(" ".join(text), featurizer, cooc,
                                            args.window_size)
                    text = []
                    chars = 0
                    Counter += 1
                    if Counter % 500 == 0:
                        sys.stdout.write(
                            "{}: Finished processing article:{}\n".format(
                                worker_id, Counter))
                        dump_cooc_to_file(worker_id, cooc, F_out)
                        cooc = {}
                    continue
                text.append(line)  # Cannot be longer than 100000
                chars += len(line)
                if chars > 10000:
                    tokens_count += process_features(" ".join(text),
                                                     featurizer, cooc,
                                                     args.window_size)
                    text = []
                    chars = 0
        dump_cooc_to_file(worker_id, cooc, F_out)
        cooc = {}
        F_out.close()
        sys.stdout.write("{}: Finished processing file:{}: {} tokens\n".format(
            worker_id, inputfile, tokens_count))
        file_count += 1
Example #11
    def test_unit_packet_pair_ratio(self):
        ft = Featurizer()

        ratio_1 = ft.packet_pair_ratio(0, 0)
        self.assertEqual(0, ratio_1)
        ratio_2 = ft.packet_pair_ratio(5, 0)
        self.assertEqual(0, ratio_2)
        ratio_3 = ft.packet_pair_ratio(5, 5)
        self.assertEqual(1, ratio_3)
        ratio_4 = ft.packet_pair_ratio(0, 1)
        self.assertEqual(1, ratio_4)
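Again only the test is listed; one hypothetical packet_pair_ratio consistent with the four assertions above (the interpretation of the two arguments as source and destination packet counts is an assumption) is:

def packet_pair_ratio(src_pkts, dst_pkts):
    # Assumed behaviour: ratio of destination to source packets; with no
    # source packets, report 1 if any destination packets were seen and 0
    # otherwise (matches the assertions above).
    if src_pkts == 0:
        return 1 if dst_pkts > 0 else 0
    return dst_pkts / float(src_pkts)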
Example #12
    def test_unit_bytes_per_packet(self):
        ft = Featurizer()

        ratio_1 = ft.bytes_per_packet(0, 0)
        self.assertEqual(0, ratio_1)
        ratio_2 = ft.bytes_per_packet(0, 5)
        self.assertEqual(0, ratio_2)
        ratio_3 = ft.bytes_per_packet(5, 5)
        self.assertEqual(1, ratio_3)
        ratio_4 = ft.bytes_per_packet(1, 0)
        self.assertEqual(0, ratio_4)
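A similarly hedged sketch of bytes_per_packet that matches the assertions above (treating a zero packet count as yielding 0 is an assumption) might be:

def bytes_per_packet(byte_count, packet_count):
    # Assumed behaviour: average bytes per packet, defined as 0 whenever the
    # packet count is 0, including the (1, 0) case asserted above.
    if packet_count == 0:
        return 0
    return byte_count / float(packet_count)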
Example #13
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='For given wikipedia dump files, \
			generate dump of article n-gram statistics')
    parser.add_argument('-e',
                        '--embedding',
                        type=str,
                        help='path to embedding txt')
    parser.add_argument('--raw', action='store_true')
    args = parser.parse_args()
    print(args)

    with tf.Graph().as_default():
        config = Config()
        featurizer = Featurizer()

        model = Model(config)
        loader = Loader(featurizer)

        if args.raw:  # prep data
            loader.load_raw()
            exit(0)

        embeddings, embed_size = featurizer.labeler.load_embedding(
            args.embedding)
        # embed_size = 100
        config.dim_embedding = embed_size
        loader.load()
        # model.add_embeddings()
        model.add_embeddings(embeddings)
Example #14
class FlowCleaning:

    featurizer = Featurizer()

    #############################################################################
    # flow_stat_clean(live, batch_number, poll_dur)
    #
    # Function to handle the full cleaning process
    # Either cleans all training batches
    # or cleans the batch with specified batch number for live classification
    #
    # Args:
    #    live: boolean True if live classification or False for training
    #    batch_number: batch_number to specify file to clean for live classification
    #    poll_dur: polling duration of stats used in feature generation
    #
    # Outputs cleaned flow batch to a new .csv file ready for classification
    #
    def flow_stat_clean(self, live, batch_number, poll_dur):

        if not live:
            file_num = 1
            # Clean all files in the training directory
            while (os.path.isfile("Neptune/stats_training/output" +
                                  str(file_num) + ".csv")):
                flow = "Neptune/stats_training/output" + str(file_num) + ".csv"
                target = "Neptune/stats_training/output" + str(
                    file_num) + "_target.txt"
                try:
                    flow_stats = open(flow, 'r')
                    flow_target = open(target, 'r')
                except:
                    logging.error('Unable to open stats and target files')

                batch_agg = self.batch_aggregate(flow_stats, flow_target,
                                                 False)

                clean_dir = "Neptune/stats_training/output" + str(
                    file_num) + "_cleaned.csv"
                target_dir = "Neptune/stats_training/output" + str(
                    file_num) + "_target_cleaned.txt"

                self.batch_cleaning(clean_dir, target_dir, batch_agg, False,
                                    poll_dur)
                file_num += 1
        else:
            file_num = batch_number
            flow_dir = "Neptune/stats_live/output" + str(file_num) + ".csv"
            try:
                flow_stats = open(flow_dir, 'r')
            except:
                logging.error('Unable to open: ' + str(flow_dir))

            batch_agg = self.batch_aggregate(flow_stats, -1, True)

            clean_dir = "Neptune/stats_live/output" + str(
                file_num) + "_cleaned.csv"

            self.batch_cleaning(clean_dir, -1, batch_agg, True, poll_dur)

    #############################################################################
    # batch_cleaning(clean_dir, target_dir, batch_agg, live, poll_dur)
    #
    # Outputs the cleaned stats with new features to the appropriate file
    #
    # Args:
    #    clean_dir: directory for the output of cleaned stats
    #    target_dir: directory for the adjusted target/ground truth values
    #    batch_agg: dictionary of cleaned statistics
    #    live: boolean True if live classification
    #    poll_dur: polling duration of stats used in feature generation
    #
    def batch_cleaning(self, clean_dir, target_dir, batch_agg, live, poll_dur):

        try:
            flow_cleaned = open(clean_dir, 'w')
        except:
            logging.error('Unable to open flow_cleaned file')

        # Final cleaned feature labels
        flow_cleaned.write(
            "eth_src,eth_dst,ip_proto,state_flag,pkts,src_pkts,dst_pkts,bytes,src_bytes,dst_bytes,"
            +
            "pkts_per_sec,bytes_per_second,bytes_per_packet,packet_pair_ratio,pair_flow\n"
        )
        if not live:
            try:
                target_cleaned = open(target_dir, 'w')
                target_cleaned.write("target\n")
            except:
                logging.error('Unable to open target cleaned file')

        # Generate new features for each flow and write each flow stat to file
        for i in batch_agg:
            stat = batch_agg[i].split(",")
            stat[11] = int(poll_dur)
            features = self.featurizer.featurizer(stat, batch_agg)

            flow_cleaned.write("{},{},{},{},{},{},{},{},{},{},{}\n".format(
                stat[0], stat[1], stat[2], stat[3], stat[4], stat[5], stat[6],
                stat[7], stat[8], stat[9], features))
            if not live:
                target_cleaned.write("{}".format(stat[12]))

    #############################################################################
    # batch_aggregate(flow_stats, flow_target, live)
    #
    # Aggregates flows with same src and dst and same protocols together
    # This provides statistics on an eth_src->eth_dst basis
    # Also generates new flow values to enable further features to be calculated
    # by the Featurizer class
    #
    # Args:
    #    flow_stats: array of flow stat records
    #    flow_target: array of ground truth values corresponding to flow stats
    #
    # Returns:
    #    batch_dict: dictionary of aggregated flow statistics using src, dst
    #                and protocol as unique key values
    #
    def batch_aggregate(self, flow_stats, flow_target, live):

        if not live:
            target_lines = flow_target.readlines()
        batch_dict = {}

        line_number = 0
        first_line_flag = True
        clean_calc = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

        flow_stats.seek(0)
        for line in flow_stats:
            if first_line_flag:
                first_line_flag = False
                line_number += 1
                continue

            stats = line.split(",")
            if str(stats[6]) == 'man':
                continue

            if 's' in str(stats[15]):
                state_flag = 2
            else:
                state_flag = -1

            key = (str(stats[0]) + "_" + str(stats[1]) + "_" + str(stats[6]) +
                   "_" + str(state_flag))
            target = -1
            if not live:
                target = str(target_lines[line_number])

            # If flow stat exists in dictionary, aggregate the counted values such as pkt_count
            if key in batch_dict:
                old_stats = batch_dict[key].split(",")
                for i in range(len(old_stats)):
                    if i >= 4 and i <= 9:
                        clean_calc[i] = int(stats[i + 4]) + int(old_stats[i])
                    elif i == 10:
                        clean_calc[i] = int(old_stats[i]) + 1

                # Set target value based on previous flow stat
                if old_stats[len(old_stats) - 1] == 1 and target == 0:
                    target = 1

                batch_dict[key] = (
                    str(stats[0]) + "," + str(stats[1]) + "," + str(stats[6]) +
                    "," + str(state_flag) + "," + str(clean_calc[4]) + "," +
                    str(clean_calc[5]) + "," + str(clean_calc[6]) + "," +
                    str(clean_calc[7]) + "," + str(clean_calc[8]) + "," +
                    str(clean_calc[9]) + "," + str(clean_calc[10]) + "," +
                    str(0) + "," + str(target))

            else:
                batch_dict[key] = (str(stats[0]) + "," + str(stats[1]) + "," +
                                   str(stats[6]) + "," + str(state_flag) +
                                   "," + str(stats[8]) + "," + str(stats[9]) +
                                   "," + str(stats[10]) + "," +
                                   str(stats[11]) + "," + str(stats[12]) +
                                   "," + str(stats[13]) + "," + str(1) + "," +
                                   str(0) + "," + str(target))

            line_number += 1

        return batch_dict

    #############################################################################
    # aggregate_stats(dir)
    #
    # Aggregates all individual cleaned stat files in the training directory
    # into one file to train on
    #
    # Outputs aggregated flow file and target file
    #
    def aggregate_stats(self, dir):

        flow_stats = open(dir + "FlowStats_cleaned.csv", "w")
        flow_target = open(dir + "FlowStats_target_cleaned.txt", "w")

        # Process first file and include header labels
        file_num = 1
        for line in open(dir + "output" + str(file_num) + "_cleaned.csv"):
            flow_stats.write(line)
        for line in open(dir + "output" + str(file_num) +
                         "_target_cleaned.txt"):
            flow_target.write(line)
        file_num += 1

        # Process remainder, excluding label headers
        while (os.path.isfile(dir + "output" + str(file_num) +
                              "_cleaned.csv")):
            flow = open(dir + "output" + str(file_num) + "_cleaned.csv")
            target = open(dir + "output" + str(file_num) +
                          "_target_cleaned.txt")
            next(flow)
            next(target)
            for line in flow:
                flow_stats.write(line)
            for line in target:
                flow_target.write(line)
            file_num += 1
        flow_stats.close()
Example #15
        'description', 'cross', 'north', 'south', 'east', 'west', '-PRON-',
        'pron', 'nee', 'regard', 'shall', 'use', 'win', 'park', 'point',
        'biking', 'follow', 'single', 'track', 'intersection', 'trailhead',
        'head', 'good', 'great', 'nice', 'time', 'include', 'place', 'come',
        'downhill', 'look', 'near'
    ])
    bitri_stops = set([
        'parking_lot', 'trail_starts', 'mile_turn', 'north_south',
        'mountain_bike', 'mountain_biking', 'single_track',
        'mountain_bike_trail', 'trail_head'
    ])
    second_stopwords = my_stopwords.union(STOPWORDS).union(bitri_stops)

    # Gensim LDA
    st_featurizer = Featurizer(first_stopwords=first_stopwords,
                               second_stopwords=second_stopwords,
                               bigrams=True,
                               trigrams=True)
    processed_docs = st_featurizer.featurize(X)
    bow_corpus, id2word = make_gensim_bow(processed_docs,
                                          no_below=3,
                                          no_above=0.6,
                                          keep_n=10000)

    k = 6
    lda_model = LdaMulticore(bow_corpus,
                             num_topics=k,
                             id2word=id2word,
                             passes=5,
                             workers=2,
                             iterations=100)
    perplexity, coherence = get_perplexity_coherence(lda_model, bow_corpus,
Example #16
def process_dataset(core_path, refined_path, dataset_name, output_path,
                    cutoff):
    core_set_list = [x for x in os.listdir(core_path) if len(x) == 4]
    refined_set_list = [x for x in os.listdir(refined_path) if len(x) == 4]
    path = refined_path

    # atomic sets for long-range interactions
    atom_types = [6, 7, 8, 9, 15, 16, 17, 35, 53]
    atom_types_ = [6, 7, 8, 16]

    # atomic feature generation
    featurizer = Featurizer(save_molecule_codes=False)
    processed_dict = {}
    for name in tqdm(os.listdir(path)):
        if len(name) != 4:
            continue
        processed_dict[name] = gen_feature(path, name, featurizer)

    # interaction features
    processed_dict = pairwise_atomic_types(path, processed_dict, atom_types,
                                           atom_types_)
    # load pka (binding affinity) data
    pk_dict = load_pk_data(path + 'index/INDEX_general_PL_data.2016')
    data_dict = processed_dict
    for k, v in processed_dict.items():
        v['pk'] = pk_dict[k]
        data_dict[k] = v

    refined_id, refined_data, refined_pk = [], [], []
    core_id, core_data, core_pk = [], [], []

    for k, v in tqdm(data_dict.items()):
        ligand = (v['lig_fea'], v['lig_co'], v['lig_atoms'], v['lig_eg'])
        pocket = (v['pock_fea'], v['pock_co'], v['pock_atoms'], v['pock_eg'])
        graph = cons_lig_pock_graph_with_spatial_context(ligand,
                                                         pocket,
                                                         add_fea=3,
                                                         theta=cutoff,
                                                         keep_pock=False,
                                                         pocket_spatial=True)
        cofeat, pk = v['type_pair'], v['pk']
        graph = list(graph) + [cofeat]
        if k in core_set_list:
            core_id.append(k)
            core_data.append(graph)
            core_pk.append(pk)
            continue
        refined_id.append(k)
        refined_data.append(graph)
        refined_pk.append(pk)

    # split train and valid
    train_idxs, valid_idxs = random_split(len(refined_data),
                                          split_ratio=0.9,
                                          seed=2020,
                                          shuffle=True)
    train_g = [refined_data[i] for i in train_idxs]
    train_y = [refined_pk[i] for i in train_idxs]
    valid_g = [refined_data[i] for i in valid_idxs]
    valid_y = [refined_pk[i] for i in valid_idxs]
    train = (train_g, train_y)
    valid = (valid_g, valid_y)
    test = (core_data, core_pk)

    with open(os.path.join(output_path, dataset_name + '_train.pkl'),
              'wb') as f:
        pickle.dump(train, f)
    with open(os.path.join(output_path, dataset_name + '_val.pkl'), 'wb') as f:
        pickle.dump(valid, f)
    with open(os.path.join(output_path, dataset_name + '_test.pkl'),
              'wb') as f:
        pickle.dump(test, f)
Example #17
    def learn3(self, k, intBaseFeat, boolBaseFeat, baseFeatureValues, exclude,
               call):
        #on the empty set of data points, return true
        if len(baseFeatureValues) == 0:
            print("called learn3 with 0 feature vectors")
            logger.info("called learn3 with 0 feature vectors")
            return PrecisFormula(BoolVal(False))
        #rename  splitIntoBoolAndIntFeatureVectors
        (intBaseFeatVectors,
         boolBaseFeatVectors) = Featurizer.getBoolAndIntFeatureVectors(
             intBaseFeat, boolBaseFeat, baseFeatureValues)

        derivFeats = self.synthesizeUniqueFeatures(intBaseFeat, boolBaseFeat,
                                                   baseFeatureValues, exclude)
        derivFeatVectors: List[
            FeatureVector] = Featurizer.generateDerivedFeatureVectors(
                derivFeats, intBaseFeat + boolBaseFeat, baseFeatureValues)
        #assert(len(baseFeatureValues) == len(derivFeatVectors))
        boolFvs = Featurizer.mergeFeatureVectors(boolBaseFeatVectors,
                                                 derivFeatVectors)

        houdini = Houdini()
        (allTrueFormula,
         indicesAllwaysTrue) = houdini.learn2(boolBaseFeat + derivFeats,
                                              boolFvs, call)
        logger.info("Houdini AlwaysTrue for k=" + str(k) + " : " +
                    allTrueFormula.toInfix() + "\n")

        if k == 0:
            return allTrueFormula
        else:
            #removing features returned by houdini and their corresponding feature vector entries.
            (remainingBaseBoolFeat, remainingDerivBoolFeat, featuresRemoved)  = \
                self.removeFeatureFromFeaturelist(boolBaseFeat, derivFeats, indicesAllwaysTrue)

            (reaminingEntriesBaseBoolFv, reaminingEntriesDerivBoolFv) = \
                self.removeFeatureEntryInFeatureVectors(boolBaseFeatVectors, derivFeatVectors, indicesAllwaysTrue)

            # features that are true on parent node should not be passed down to children;(they are redundantly also true in child nodes)
            exclude = exclude + featuresRemoved
            lookAhead = len(intBaseFeatVectors[0])

            ######################################
            # bug: chooseFeatureImplication does not update remaining bool features or feature vectors. Idx is with respect to updates
            (f,idx, posBaseFv, negBaseFv, remainingBaseBoolFeat, remainingDerivBoolFeat ) = \
                self.chooseFeatureImplication(allTrueFormula,intBaseFeat,remainingBaseBoolFeat , remainingDerivBoolFeat, \
                    Featurizer.mergeFeatureVectors(intBaseFeatVectors,reaminingEntriesBaseBoolFv) , reaminingEntriesDerivBoolFv, lookAhead, call )
            ######################################
            if idx < 0:
                print("Predicate: " + call + " for k = " + str(k) + " : None")
                logger.info("Predicate: " + call + " for k = " + str(k) +
                            " : None" + "\n")
                return allTrueFormula
            #TODO: choose should return boolBasePosFv and intBasePosFv ...
            #(f,idx, posBaseFv, negBaseFv) = \
            #    self.chooseFeature2(remainingBaseBoolFeat + remainingDerivBoolFeat, \
            #        Featurizer.mergeFeatureVectors(intBaseFeatVectors,reaminingEntriesBaseBoolFv), reaminingEntriesDerivBoolFv, call, lookAhead)
            logger.info("Predicate: " + call + " for k = " + str(k) + " : " +
                        str(f) + "\n")
            print("Predicate chosen at " + call + " : " + str(f))

            #featureSplitRemoved == f
            (newBoolBaseFeat, newDeriveBaseFeat, featureSplitRemoved) = \
                self.removeFeatureFromFeaturelist(remainingBaseBoolFeat, remainingDerivBoolFeat, [idx])
            # if predicate to split on is in derivedFeatures, then add to exclude list;
            if len(remainingBaseBoolFeat) == len(newBoolBaseFeat):
                exclude = exclude + (f, )
            else:
                # if predicate to split is in baseFeatures, the update posBaseFv and negBaseFv feature vectors
                posBaseFv = self.removeFeatureEntryInBaseFv(
                    posBaseFv, [idx + lookAhead])
                negBaseFv = self.removeFeatureEntryInBaseFv(
                    negBaseFv, [idx + lookAhead])

            posPost = self.learn3( k-1,\
                intBaseFeat, newBoolBaseFeat, posBaseFv, exclude, call + " Left")  #recursive call

            logger.info(call + " Left: " + " for k = " + str(k) + " : " +
                        posPost.toInfix())
            print(call + " Left: " + " for k = " + str(k) + " : " +
                  posPost.toInfix())

            negPost = self.learn3( k-1,\
                intBaseFeat, newBoolBaseFeat, negBaseFv, exclude, call +" Right") #recursive call

            logger.info(call + " Right: " + " for k = " + str(k) + " : " +
                        negPost.toInfix())
            print(call + " Right: " + " for k = " + str(k) + " : " +
                  negPost.toInfix())

            disjunctivePost = And(
                allTrueFormula.formulaZ3,
                Or(And(posPost.formulaZ3, f.varZ3),
                   And(negPost.formulaZ3, Not(f.varZ3))))
            precisPost = PrecisFormula(disjunctivePost)
            return precisPost
Example #18
def learnPostUpToK(p, PUTName, outputFile, k, destinationOfTests):
    sygusExecutable = "Precis/Learners/EnumerativeSolver/bin/starexec_run_Default"
    tempLocation = "tempLocation"
    sygusFileName = "postcondition.sl"
    # assumes MSBuild.exe is in PATH
    inst = Instrumenter(
        "MSBuild.exe",
        "./Instrumenter/Instrumenter/bin/Debug/Instrumenter.exe")
    p.ExtractObservers(PUTName, outputFile)

    # returns list of base features
    baseFeatures: Tuple[PrecisFeature] = p.ReadObserversFromFile(outputFile)
    allPostconditions = []
    allBaseFeatureVectors = []

    synthesizer = FeatureSynthesis(sygusExecutable, tempLocation,
                                   sygusFileName)
    currentPostcondition = PrecisFormula(BoolVal(False))
    inst.instrumentPost(p, currentPostcondition, PUTName)
    rounds = 1
    totalPexTime = 0.0
    totalLearningTime = 0.0
    while True:
        print("starting round: " + str(rounds))
        pex = Pex()

        startTimePex = time.time()
        baseFeatureVectors: List[FeatureVector] = pex.RunTeacher(
            p, PUTName, baseFeatures)
        pexTime = time.time() - startTimePex
        totalPexTime += pexTime
        print("pex time: " + str(totalPexTime))
        print("learning time: " + str(totalLearningTime))

        evaluation.copyTestFilesToEvaluationDir(pex.testsLocation,
                                                destinationOfTests, rounds)
        #sys.exit(0)
        allBaseFeatureVectors.extend(baseFeatureVectors)

        if all(baseFeatureVectors[i].testLabel
               for i in range(0, len(baseFeatureVectors))):
            print("found it\n************************\n")
            simplifiedPost = PrecisFormula(
                currentPostcondition.precisSimplify())
            return currentPostcondition, simplifiedPost, rounds, totalPexTime, totalLearningTime, len(
                allBaseFeatureVectors)

            # # Shambo: adding negation checking

            # negPost = PrecisFormula(Not(currentPostcondition.formulaZ3))

            # inst = Instrumenter(
            #     "MSBuild.exe", "./Instrumenter/Instrumenter/bin/Debug/Instrumenter.exe")
            # inst.instrumentPost(p, negPost, PUTName)

            # negBaseFeatureVectors: List[FeatureVector] = pex.RunTeacher(p, PUTName, baseFeatures)

            # if len(negBaseFeatureVectors) == 0:
            #     print ( "truly found it")
            #     simplifiedPost = PrecisFormula(currentPostcondition.precisSimplify())
            #     return currentPostcondition, simplifiedPost, rounds, totalPexTime, totalLearningTime, len(allBaseFeatureVectors)

            # else:
            #     print("fake found it")

            #     for i in range(0,len(negBaseFeatureVectors)):
            #         negBaseFeatureVectors[i].testLabel = "True"

            #     baseFeatureVectors.extend(negBaseFeatureVectors)
            #     allBaseFeatureVectors.extend(negBaseFeatureVectors)

        if rounds == 16:
            print("BAD!")
            simplifiedPost = PrecisFormula(
                currentPostcondition.precisSimplify())
            return currentPostcondition, simplifiedPost, rounds, totalPexTime, totalLearningTime, len(
                allBaseFeatureVectors)

        if len(baseFeatureVectors) == 0:
            logger1.info(
                "process TERMINATED with TG not generating any test! DEBUG ME!\n"
            )
            simplifiedPost = PrecisFormula(
                currentPostcondition.precisSimplify())
            return currentPostcondition, simplifiedPost, rounds, totalPexTime, totalLearningTime, len(
                allBaseFeatureVectors)

        intBaseFeatures, boolBaseFeatures = Featurizer.getIntAndBoolFeatures(
            baseFeatures)
        disLearner = DisjunctiveLearner(synthesizer)

        logger1.info("#############\nRound: " + str(rounds) + "\n")
        # Learning function
        startLearningTime = time.time()
        postcondition = disLearner.learn3(k, intBaseFeatures, boolBaseFeatures,
                                          allBaseFeatureVectors, (), "root")
        learningTime = time.time() - startLearningTime
        totalLearningTime += learningTime

        logger1.info("unsimplified post:\n" + postcondition.toInfix() + "\n")

        print("unsimplified post " + postcondition.toInfix())
        print("simplified post " +
              PrecisFormula(postcondition.precisSimplify()).toInfix())

        # Shambo
        # Always insert simplified formula
        postcondition = PrecisFormula(postcondition.precisSimplify())

        # assumes ms build in path
        inst = Instrumenter(
            "MSBuild.exe",
            "./Instrumenter/Instrumenter/bin/Debug/Instrumenter.exe")
        inst.instrumentPost(p, postcondition, PUTName)

        currentPostcondition = PrecisFormula(postcondition.formulaZ3)
        allPostconditions.append(postcondition.formulaZ3)
        rounds = rounds + 1
Example #19
import cv2
import skimage.io as skio

from featurizer import Featurizer, convert_lab
from colorizer import Colorizer



def get_grayscale(image):
	gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	return cv2.merge((gray,gray,gray))


if __name__=="__main__":
	training_images = ["images/grass1.jpg","images/grass2.jpg"]
	test_image = skio.imread("images/grass3.jpg")

	#getting the right featurizer
	f = Featurizer(training_images)
	f.compute_k_means()
	print("Getting features...")
	f.compute_features()
	gray_test = get_grayscale(test_image)
	


	#getting the right colorizer
	colorizer = Colorizer(f)
	print("Starting Training of SVMs...")
	colorizer.train()

	#running the experiment
	print("Colorizing Image...")
	colored_image = colorizer.color_image(gray_test)
Example #20
def main():
    featurizer = Featurizer()
    (options, args) = parser.parse_args()
    if not options.filename and not options.dir:
        parser.error('missing -f or -d option')
    if options.filename and options.dir:
        parser.error('please choose only one option to run feature extraction')
    if options.out:
        if not os.path.isdir(options.out):
            parser.error('invalid output path')
        else:
            featurizer.set_out_dir(options.out)
    if options.filename:
        if not os.path.isfile(options.filename):
            parser.error("" + options.filename + " is not a file")
        else:
            featurizer.set_in_path(options.filename)
    if options.dir:
        if not os.path.isdir(options.dir):
            parser.error("" + options.dir + " is not a directory")
        else:
            featurizer.set_in_path(options.dir)
    featurizer.set_restart(options.should_restart)
    featurizer.prepare()
    featurizer.run()
Example #21
    def chooseFeatureImplication(self, alwaysTrueFormula, intBaseFeatures, baseBoolFeatures, \
         derivBoolFeatures, baseFv, derivFv, lookAhead, call ):
        houdini = Houdini()
        fvPos = list()
        fvPosDeriv = list()
        fvNeg = list()
        fvNegDeriv = list()
        irrelevantFeatures = ()
        irrelevantIndices = []
        boolFeatures = baseBoolFeatures + derivBoolFeatures
        for idx in range(0, len(boolFeatures)):
            #region pruneFunction
            feature = boolFeatures[idx]
            if is_int(feature.varZ3):
                assert (False)
            (fvPos, fvPosDeriv, fvNeg,
             fvNegDeriv) = self.splitSamplesImplication(
                 feature, idx + lookAhead, baseFv, derivFv)
            #if len(fvPos) == 0 or len(fvNeg) == 0:
            #irrelevantIndices.append(idx)
            #continue

            (posIntBaseFv,
             posBoolBaseFv) = Featurizer.getBoolAndIntFeatureVectors(
                 intBaseFeatures, baseBoolFeatures, fvPos)
            (negIntBaseFv,
             negBoolBaseFv) = Featurizer.getBoolAndIntFeatureVectors(
                 intBaseFeatures, baseBoolFeatures, fvNeg)

            posFvs = Featurizer.mergeFeatureVectors(posBoolBaseFv, fvPosDeriv)
            negFvs = Featurizer.mergeFeatureVectors(negBoolBaseFv, fvNegDeriv)

            (posAllTrueFormula, posIndicesAllwaysTrue) = houdini.learn2(
                boolFeatures, posFvs, call +
                " from implication check-- split from pred " + str(feature))
            (negAllTrueFormula, negIndicesAllwaysTrue) = houdini.learn2(
                boolFeatures, negFvs, call +
                " from implication check-- split from pred " + str(feature))
            if len(fvPos) != 0 and len(fvNeg) != 0:
                logger.info(call + " implication check-- split pred: " +
                            str(feature))
                logger.info(call + " implication check-- featurePos: " +
                            str(posAllTrueFormula.toInfix()))
                logger.info(call + " implication check-- featureNeg: " +
                            str(negAllTrueFormula.toInfix()) + "\n")

            #disjunct z3 type
            disjunct = Or(And(posAllTrueFormula.formulaZ3, feature.varZ3),
                          And(negAllTrueFormula.formulaZ3, Not(feature.varZ3)))
            implication = Implies(alwaysTrueFormula.formulaZ3, disjunct)
            solver = Solver()
            # check (not (postK0 => postK1)) is unsat
            solver.add(Not(implication))
            check = solver.check()
            # splitting on `feature` does not add new information: alwaysTrueFormula -> (OR(f and posSplit, ~f and negSplit)) is valid
            if str(check) == 'unsat':
                #collect irrelevant features and indices to remove
                irrelevantFeatures = irrelevantFeatures + (feature, )
                irrelevantIndices.append(idx)
            #splitting adds new information
            elif str(check) == 'sat':
                pass
            else:
                # solver does not know
                assert (False)
            #endregion

        copyBaseIntFeat = tuple(intBaseFeatures)
        copyBaseBoolFeat = tuple(baseBoolFeatures)
        copyDerivFeat = tuple(derivBoolFeatures)
        #(remainingBaseBoolFeat, remainingDerivBoolFeat, featuresRemoved)  = \
        #    self.removeFeatureFromFeaturelist(boolBaseFeat, derivFeats, indicesAllwaysTrue)
        (intBaseFv, boolBaseFv) = Featurizer.getBoolAndIntFeatureVectors(
            copyBaseIntFeat, copyBaseBoolFeat, baseFv)

        (copyRemainingBaseBoolFeat, copyRemainingDerivBoolFeat, featuresRemoved) = \
            self.removeFeatureFromFeaturelist(copyBaseBoolFeat, copyDerivFeat, irrelevantIndices)

        #boolFvs = Featurizer.mergeFeatureVectors(boolBaseFv, derivFv)
        (copyReaminingEntriesBaseBoolFv, reaminingEntriesDerivBoolFv) = \
            self.removeFeatureEntryInFeatureVectors(boolBaseFv, derivFv, irrelevantIndices)
        #Debug Check
        if (len(copyRemainingBaseBoolFeat) +
                len(copyRemainingDerivBoolFeat)) == 0:
            return (None, -1, None, None, None, None)
        skipAhead = len(intBaseFv[0])
        newBaseFv = Featurizer.mergeFeatureVectors(
            intBaseFv, copyReaminingEntriesBaseBoolFv)

        (f, idx, posBaseFv, negBaseFv) = self.chooseFeature2(
            copyRemainingBaseBoolFeat + copyRemainingDerivBoolFeat, newBaseFv,
            reaminingEntriesDerivBoolFv, call, skipAhead)
        #print(irrelevantIndices)

        #intBaseFeatures = copyBaseIntFeat
        #baseBoolFeatures = copyRemainingBaseBoolFeat
        #erivBoolFeatures = copyDerivFeat
        #baseFv = newBaseFv
        #derivFv = reaminingEntriesDerivBoolFv
        return (f, idx, posBaseFv, negBaseFv, copyRemainingBaseBoolFeat,
                copyRemainingDerivBoolFeat)
Example #22
  optparser = optparse.OptionParser()
  optparser.add_option("-d", "--dir", dest="dir",  default="data/", help="Root data directory")
  optparser.add_option("-f", "--features", dest="features",  default='light', help="Comma separated list of feature groups to use")
  optparser.add_option("-v", "--save", dest="save",  default=False, action="store_true", help="Train a model and save it to the specified file.")
  optparser.add_option("-m", "--modelfile", dest="modelfile", help="File to read model from/write model to.")
  optparser.add_option("-p", "--predict", dest="predict",  default=False, action="store_true", help="Load a saved mode and use it to on unseen data.")
  optparser.add_option("-x", "--feature_selection", dest="feature_selection",  default=False, action="store_true", help="Print performance of feature groups one at a time.")
  optparser.add_option("-e", "--extra_train", dest="extra_train",  default=None, type="string", help="Add extra (possibly out of domain) data to training")
  optparser.add_option("-a", "--ablation", dest="ablation",  default=False, action="store_true", help="Run ablation analysis by feature group.")
  optparser.add_option("-r", "--print_best_features", dest="print_best_features",  default=False, action="store_true", help="Print features with highest weights.")
	
  (opts, _) = optparser.parse_args()

  label_file = "%s/labels"%opts.dir
  ftzr = Featurizer(use=opts.features)

  if opts.predict : 
    bundle = pickle.load(open(opts.modelfile))
    clf = bundle['clf']
    dv = bundle['dv']
    _, X, _, nm = get_data(label_file, ftzr, dv, encodeY=False)

  else : 
    y, X, dv, nm = get_data(label_file, ftzr)
    yplus = None
    Xplus = None
    nmplus = None
    if opts.extra_train is not None : 
      for dr in opts.extra_train.split(',') : 
        label_file = "%s/labels"%dr
Example #23

fragment = "your head look like a ball however hubert has a head which is a polygon this difference derives from the fact that hubert is gamma perturbation stable"
fragment = fragment.split(" ")
all_grams = {}
counter = 0
for word in fragment:
    all_grams[word] = (counter, 0)
    counter += 1
all_grams["your head"] = (counter, 0)
counter += 1
all_grams["a ball"] = (counter, 0)
counter += 1
all_grams["derives from"] = (counter, 0)
counter += 1
featurizer = Featurizer(Settings())
result = {}
process(fragment, featurizer, result)
print(result)

### DELETED
"""
calc_cooccurence(fragment, all_grams) is a function that will calculate the cooccurrence matrix
for a given fragment and dumps the partial result into a dictionary. These dictionaries from
different fragments should be combined to generate the final result.

fragment:	A list of strings/tokens (not integer labels) representing the raw text.
	Note -	windows crossing fragment boundaries will be ignored; to make the cooccurrence result more
			precise, fragment should be a relatively longer list

all_grams:	A dictionary of n-grams (including 1-gram/word) that we will care about. This dict