Example #1
def test_avg(iterations, test_file, beam_size):
    data = prepare_data.read_file(test_file)
    feature = Feature()
    decoder = Decoder(beam_size, feature.get_score)

    count = 0
    data_size = len(data)

    model_file = open(
        '/home/xzt/CWS/model_result/avg-model_beam-size-' + str(beam_size) +
        '.pkl', 'rb')
    feature.load_model(model_file)
    model_file.close()
    seg_data_file = ('/home/xzt/CWS/test_seg_data/avg-test-seg-data' +
                     '_beam-size-' + str(beam_size) + '.txt')
    # open the output file once, rather than re-opening it for every line;
    # the with-block closes it on exit
    with open(seg_data_file, 'a') as f:
        for line in data:
            z = decoder.beamSearch(line)
            f.write(' '.join(z) + '\n')
            count += 1
            if count % 1000 == 0:
                print("segment with avg-model, finish %.2f%%" %
                      ((count / data_size) * 100))
    print("segment with avg model finished")
Example #2
    def create_all_relations_and_features(self):
        """Creating all possible relations between all entities.

        Creating features here for performance.
        """
        all_relations = []

        feature = None

        for source in self.events + self.timex:
            for target in self.events + self.timex:
                for i, time in enumerate(RelationType()):
                    new_relation = Relation("all", self, source, target, time)

                    # The feature depends only on (source, target), so it is
                    # computed once, on the first relation type, and reused
                    if i == 0:
                        f = Feature(new_relation)
                        feature = f.get_feature()

                    if new_relation not in self.relations:
                        new_relation.set_feature(feature)
                        all_relations.append(new_relation)

                feature = None

        self.relations = self.relations + all_relations
Example #3
 def build_features(self, image_shape):
     height, width = image_shape
     features = []
     # TODO: play with minimum feature size
     for w in range(1, width+1):
         for h in range(1, height+1):
             x = 0
             while x + w < width:
                 y = 0
                 while y + h < height:
                     # 2 horizontally aligned blocks
                     root = Region(x, y, w, h)
                     right = Region(x + w, y, w, h)
                     # check that the feature fits inside the image
                     if x + 2 * w < width:
                         features.append(Feature([right], [root]))
                     bottom = Region(x, y + h, w, h)
                     # 2 vertically aligned blocks
                     if y + 2 * h < height:
                         features.append(Feature([root], [bottom]))
                     # 3 horizontally aligned blocks
                     right2 = Region(x + 2 * w, y, w, h)
                     if x + 3 * w < width:
                         features.append(Feature([right], [right2, root]))
                     # 4 blocks in a 2x2 checkerboard
                     cross_bottom = Region(x + w, y + h, w, h)
                     if x + 2 * w < width and y + 2 * h < height:
                         features.append(Feature([right, bottom], [root, cross_bottom]))
                     y += 1
                 x += 1
     return features
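For a sense of scale, these nested loops enumerate a very large feature set even for small windows. Below is a standalone sketch (not part of the original class) that re-derives just the two-rectangle horizontal count under the same strict bounds checks used above:

def count_two_rect_horizontal(width=24, height=24):
    # mirrors the loops above: x runs while x + w < width, y while
    # y + h < height, and the feature is kept only if x + 2 * w < width
    count = 0
    for w in range(1, width + 1):
        for h in range(1, height + 1):
            for x in range(0, width - w):
                for y in range(0, height - h):
                    if x + 2 * w < width:
                        count += 1
    return count

print(count_two_rect_horizontal())  # 36432 two-rectangle features in a 24x24 window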
Example #4
def flw_dataset_classify():
    f = Feature()
    paths, classes = loadFaceData('face.csv', nrows=82)
    X = []
    y = []
    for index, path in enumerate(paths):
        ar = f.getFeature(path)
        print(index, path)
        if ar.all() == 0:
            continue
        X.append(ar)
        y.append(classes[index])
    X = np.array(X)
    y = np.array(y)
    print(X.shape)
    print(X)
    print(y)
    X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
        X, y, test_size=0.3, stratify=y)
    nearestCentroid = NearestCentroid()
    nearestCentroid.fit(X_train_data, y_train_data)

    predict_y = nearestCentroid.predict(X_test_data)
    acc = accuracy_score(y_test_data, predict_y)

    print(acc)
Example #5
def scut_fbp_test():
    f = Feature()
    # af1and5 0.890287769784
    paths, classes = loadFaceData(
        './dataset/af1and5.csv',
        nrows=100)  # './dataset/all(round_score).csv' for full class
    X = []
    y = []
    for index, path in enumerate(paths):
        ar = f.getFeature(path)
        print(index, path)
        if ar.all() == 0:
            continue
        X.append(ar)
        y.append(round(classes[index]))
    X = np.array(X)
    y = np.array(y)
    print(X.shape)
    print(X)
    print(y)
    X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
        X, y, test_size=0.3, stratify=y)
    nearestCentroid = NearestCentroid()
    nearestCentroid.fit(X_train_data, y_train_data)

    predict_y = nearestCentroid.predict(X_test_data)
    acc = accuracy_score(y_test_data, predict_y)

    print(acc)
Example #6
 def tf_idf_training(comment_cnt_lower_bound, train_ratio):
     # train_set, cv_set = Train.simple_partition(comment_cnt_lower_bound, \
     #         train_ratio)
     # idf_dict = Feature.cal_idf(train_set, Config.train_idf_path)
     train_set = Train.get_train_set()
     Feature.cal_tf_idf(train_set, Config.train_idf_path,
                        Config.train_tf_idf_path, True, 200)
     Genome.cal_tf_idf(Config.train_tf_idf_path, \
             Config.train_genome_tf_idf_path, 200)
Example #7
def decorate_outside(obj, options=Map()):

    obj.points = []
    obj.points_edges = []
    obj.material_clear = Blocks.AIR

    border = flatten_list_of_lists(
        [vg.get_line_from_points(l[0], l[1]) for l in options.lines])

    if options.options.outside == "flowers":
        flowers_1 = []
        flowers_2 = []
        for i, b in enumerate(border):
            # TODO: Refactor to have multiple numbers of flowers

            if (i % 2) == 0:
                flowers_1.append(b)
            else:
                flowers_2.append(b)

        colors = Blocks.kind("Flower")
        np.random.shuffle(colors)

        obj.features.append(
            Feature("flowers", flowers_1, Map(material=colors[0])))
        obj.features.append(
            Feature("flowers", flowers_2, Map(material=colors[1])))

    elif options.options.outside == "trees":
        trees = []
        for i, b in enumerate(border):
            if (i % 3) == 0:
                trees.append(b)

        colors = Blocks.kind("Sapling")
        np.random.shuffle(colors)

        obj.features.append(Feature("flowers", trees, Map(material=colors[0])))

    elif options.options.outside == "grass":
        trees = []
        for i, b in enumerate(border):
            if (i % 3) == 0:
                trees.append(b)

        obj.features.append(
            Feature("flowers", trees, Map(material=Blocks.DOUBLETALLGRASS)))

    elif options.options.outside == "fence":
        fence_type = np.random.random_integers(188, 192)
        obj.features.append(Feature("fence", border, Map(material=fence_type)))

    return obj
Example #8
def feature_detection(S_ana_log):

    # Find all non-NaN indices in S_ana_log
    indices = np.argwhere(~np.isnan(S_ana_log))
    #print(np.shape(indices))
    Features_list = []  # initialize Features_list

    for [x_ind, y_ind] in indices:  # For each pair of indices (pixel)

        if len(Features_list) == 0:  # if Features_list is empty
            newFeature = Feature(x_ind, y_ind)  # create a new feature
            Features_list.append(newFeature)  # add new feature to Features_list

        else:
            # list of logicals marking which Features border the current pixel
            border_list = []
            # list of neighboring Features that border the current pixel
            sublist = []
            for currentFeature in Features_list:  # for each feature in list
                # record whether this feature borders the pixel
                border_list.append(currentFeature.borders(x_ind, y_ind))

            indslist = np.where(border_list)[0]
            if len(indslist) == 1:  # pixel borders exactly 1 Feature
                hunterFeature = Features_list[indslist[0]]  # find that Feature
                hunterFeature.add(x_ind, y_ind)  # add pixel to that Feature

            # elif, so a pixel handled above is not also turned into a
            # brand-new Feature below
            elif len(indslist) > 1:  # pixel borders more than 1 Feature
                for ind in indslist:
                    sublist.append(Features_list[ind])  # collect those Features
                for s in sublist:
                    Features_list.remove(s)  # remove them from Features_list

                hunterFeature = conjoin(sublist)  # conjoin all Features in sublist
                hunterFeature.add(x_ind, y_ind)  # add pixel to conjoined Feature
                Features_list.append(hunterFeature)  # keep the conjoined Feature

            else:  # pixel does not border any existing Feature
                newFeature = Feature(x_ind, y_ind)  # create a new feature
                Features_list.append(newFeature)  # add it to Features_list

    return (Features_list, indices)
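The loop above is, in effect, connected-component labeling over the non-NaN pixels of S_ana_log. As a standalone cross-check (not the author's code, and scipy's default 4-connectivity may or may not match what Feature.borders implements), scipy.ndimage.label performs the same grouping:

import numpy as np
from scipy import ndimage

S_ana_log = np.full((5, 5), np.nan)
S_ana_log[0, 0:2] = 1.0    # one blob of valid pixels
S_ana_log[3:5, 3:5] = 2.0  # a second, separate blob
labels, n_features = ndimage.label(~np.isnan(S_ana_log))
print(n_features)  # 2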
Example #9
    def saveWordNetFeatures(self, fileOut):
        feature = Feature()

        synSynToScore = {}
        xuidPairs = self.getAllXUIDPairs()
        print("calculating wordnet features for", len(xuidPairs),
              "unique pairs")
        i = 0
        completed = set()
        for xuid1, xuid2 in xuidPairs:
            uid1 = self.corpus.XUIDToMention[xuid1].UID
            uid2 = self.corpus.XUIDToMention[xuid2].UID
            if (uid1, uid2) in completed or (uid2, uid1) in completed:
                continue
            completed.add((uid1, uid2))
            textTokens1 = self.corpus.XUIDToMention[xuid1].text
            textTokens2 = self.corpus.XUIDToMention[xuid2].text
            bestScore = -1
            for t1 in textTokens1:
                syn1 = wn.synsets(t1)
                if len(syn1) == 0:
                    continue
                syn1 = syn1[0]
                for t2 in textTokens2:
                    syn2 = wn.synsets(t2)
                    if len(syn2) == 0:
                        continue
                    syn2 = syn2[0]
                    curScore = -1
                    if (syn1, syn2) in synSynToScore:
                        curScore = synSynToScore[(syn1, syn2)]
                    elif (syn2, syn1) in synSynToScore:
                        curScore = synSynToScore[(syn2, syn1)]
                    else:  # calculate it
                        curScore = wn.wup_similarity(syn1, syn2)
                        # don't want to store tons; look-up is cheap
                        synSynToScore[(syn1, syn2)] = curScore
                    # update bestScore for cached scores too, not only
                    # freshly computed ones
                    if curScore is not None and curScore > bestScore:
                        bestScore = curScore

            feature.addRelational(uid1, uid2, bestScore)
            i += 1
            if i % 1000 == 0:
                print("\tprocessed",
                      i,
                      "of",
                      len(xuidPairs),
                      "(%2.2f)" % float(100.0 * i / len(xuidPairs)),
                      end="\r")

        with open(fileOut, 'wb') as pickle_out:
            pickle.dump(feature, pickle_out)
        print("")
Example #10
def onLine():
    analysis = Analysis('../data/all.csv')
    analysis.dataDistribution()
    feature = Feature()
    feature.newFeature()
    feature.categoryNumerical('../data/train.csv', '../data/test.csv')
    #train = TrainAndPredict('../data/rf/train.csv', '../data/rf/validation.csv', '../data/test.csv')
    train = TrainAndPredict('../data/rf/one_hot_train.csv', '../data/rf/one_hot_validation.csv', '../data/test.csv')
    #train.gbdtClassifier()
    #train.gbdtRegressor()
    #train.linearRegression()
    #train.logisticRegression()
    train.xgbost()
Example #11
	def create_features(self, img_height, img_width, min_feature_width, max_feature_width, min_feature_height, max_feature_height):
	    features = []
	    print('Creating features ...')
	    for feature in FeatureTypes:
	        feature_start_width = max(min_feature_width, feature[0])
	        for feature_width in range(feature_start_width, max_feature_width, feature[0]):
	            feature_start_height = max(min_feature_height, feature[1])
	            for feature_height in range(feature_start_height, max_feature_height, feature[1]):
	                for x in range(img_width - feature_width):
	                    for y in range(img_height - feature_height):
	                        features.append(Feature(feature, (x, y), feature_width, feature_height, 0, 1))
	                        features.append(Feature(feature, (x, y), feature_width, feature_height, 0, -1))
	    print('..done. ' + str(len(features)) + ' features created.')
	    return features
Example #12
def getSample(sentence):
	write_file_1 = open("uni.test.literal","a")
	write_file_2 = open("bi.test.literal","a")
	concept_list = reorder_concepts(sentence.concepts)
	params = []
	for concept in concept_list[0]:
		params.append((concept,[]))
	for concept in concept_list[1]:
		params.append((concept,[]))
	for function in concept_list[2]:
		if function.param_num == 1:
			# iterate over a snapshot; appending to params while
			# iterating it directly would never terminate
			for param in params[:]:
				params.append((function, [param]))
		else:
			for i in range(0,len(params)):
				for j in range(i+1,len(params)):
					if not function_filter(function, params[i], params[j]):
						params.append((function, [params[i],params[j]]))

	samples = []
	for predicate in concept_list[3]:
		if predicate.param_num == 1:
			for param in params:
				samples.append((predicate, [param]))  # param must be wrapped in a list
		else:
			for i in range(0,len(params)):
				for j in range(i+1,len(params)):
					if not binary_filter(predicate, params[i], params[j]):
						samples.append((predicate,[params[i],params[j]]))
					if order_matter(predicate):
						if not binary_filter(predicate, params[j], params[i]):
							samples.append((predicate,[params[j],params[i]]))

	for sample in samples:
		print sentence.text
		print sample[0].name+"(",
		if len(sample[1]) == 1:
			print sample[1][0][0].name+  "-" +str(sample[1][0][0].token_id) +")"
			sam = Feature([sample[0],sample[1][0][0]], sentence)
			features = sam.generateFeature()
			write_file_1.write(sentence.text+"\n"+sample[0].name + "-" + str(sample[0].token_id) + "(" + sample[1][0][0].name+  "-" +str(sample[1][0][0].token_id) +")\t"+convert_features(features)+"\n")

		else:
			print sample[1][0][0].name + "-" + str(sample[1][0][0].token_id) +","+sample[1][1][0].name + "-" + str(sample[1][1][0].token_id)+")"
			sam = Feature([sample[0],sample[1][0][0],sample[1][1][0]], sentence)
			features = sam.generateFeature()
			write_file_2.write(sentence.text+"\n"+sample[0].name+ "-" + str(sample[0].token_id)+"("+sample[1][0][0].name + "-" + str(sample[1][0][0].token_id) +","+sample[1][1][0].name + "-" + str(sample[1][1][0].token_id)+")\t"+convert_features(features)+"\n")

		#print sample, features
		print features
Example #13
    def parse_all_docs(self):
        fob = Feature(self.exp)

        cob = Category()
        cob.get_category_done_list()

        stime = time.time()
        print "Parsing documents in " + self.exp
        print "Start time: " + time.ctime(stime)

        with open(self.listing_document_path, 'r') as f:
            lines = [line.strip() for line in f]
            lines = [line for line in lines[2:] if line]

            document_id = ''
            for line in lines:
                elements = line.split(' ')
                if elements[0] == '#':
                    document_id = elements[3]
                    continue

                c_id, category = self._get_category(elements[0])

                if category == 'NA':
                    continue

                if c_id in cob.category_done_list and \
                  not self.ignore_duplicate_category:
                    continue

                sample = self._get_features(fob, elements[1:])
                sample['Category'] = category
                sample['Id'] = document_id

                self.samples = self.samples.append(sample, ignore_index=True)

                print "Document #" + document_id + " parsed"

            self.samples = self.samples.fillna(0)

        etime = time.time()
        print "Documents in " + self.exp + ' parsed'
        print "End time: " + time.ctime(etime)
        print "Time taken: " + str(etime - stime) + " seconds"

        cob.update_category_done_list([self.category_id1, self.category_id2])

        fob.destroy_list()
Example #14
def add_alters_to_ego_net(ego_net, alter_features_file, ego_net_features):
    '''
    ego_net: object of the EgoNet class
    alter_features_file: file the user inputs
    ego_net_features: used to access the feature dictionary
    The function splits the information in each line of the file,
    gets the feature name and value using the different classes,
    calls the add_feature function from the Node class to add each
    feature to a node, and adds the node to ego_net with the
    add_alter_node function from the EgoNet class.
    Returns the ego_net object.
    '''
    for line in alter_features_file:  #goes through each line in the file
        pieces = line.split()  #splits the line into a list of fields
        node_id = int(pieces[0])  #gets the id from the first field
        #creates a Node from the id and the number of remaining fields
        new_node = Node(node_id, len(pieces[1:]))

        #uses enumerate to get the index and value of each feature field
        for i, j in enumerate(pieces[1:]):
            feature_name = ego_net_features[i][1]  #gets feature name
            feature_value = ego_net_features[i][0]  #gets feature value
            #calls the Feature class to build the feature object
            feature_object = Feature(feature_name, feature_value, int(j))
            #calls add_feature from the Node class to attach it to the node
            new_node.add_feature(i, feature_object)
        #uses add_alter_node from the EgoNet class to register the node
        ego_net.add_alter_node(new_node)
    return ego_net  #returns the ego_net object
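The alter-line format is implicit in the parsing above; a self-contained toy line makes it concrete (the id and the 0/1 feature values here are hypothetical):

line = "42 1 0 1\n"
pieces = line.split()                  # ['42', '1', '0', '1']
node_id = int(pieces[0])               # 42
values = [int(j) for j in pieces[1:]]
print(node_id, values)                 # 42 [1, 0, 1]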
Example #15
def add_alters_to_ego_net(ego_net, alter_features_file, ego_net_features):
    '''
    Iterates through each line in the features_file using for loop
        Splits each line into a list separated by spaces
        Isolates the alter_id, and the alter values in the line_list
        Creates a Node object using the alter_id and the number of features
        For each value in the alter_values
            Use the alter add_feature method to add features to that alter
        Add the node/alter to the ego_net
    Returns: ego_net
    '''
    #Iterates through each line in the feature_file
    for line in alter_features_file:
        #Splits line into a list
        a_list = line.split()
        #Isolates values
        alter_id = int(a_list[0])
        line_list = a_list[1:]
        #Creates a Node object and assigns it to alter
        alter = Node(alter_id, len(line_list))
        #Iterates through each value in the alter_values list
        for i, digit in enumerate(line_list):
            # in order to add a feature we must create a Feature instance
            alter.add_feature(
                i,
                Feature((ego_net_features[i][1]), ego_net_features[i][0],
                        int(digit)))
        #Add the alter to the ego_net
        ego_net.add_alter_node(alter)
    return ego_net
Example #16
 def train_opinion_tokens():
     comment_cnt_dict = {}
     token_df_dicts = {}
     processed_file_cnt = 0
     for root, dir, files in os.walk(Config.short_comment_path):
         for file_name in files:
             douban_id = file_name
             comments = DoubanComment.get_comments(douban_id)
             for comment in comments:
                 rating = comment['rating']
                 if rating == 50 or rating == 10:
                     comment_cnt_dict[rating] = \
                             comment_cnt_dict.get(rating, 0) + 1
                     token_df_dicts[rating] = token_df_dicts.get(rating, {})
                     valid_tokens_set = Feature.get_valid_tokens(\
                             comment['comment'])
                     for token in valid_tokens_set:
                         token_df_dicts[rating][token] = \
                                 token_df_dicts[rating].get(token, 0) + 1
             processed_file_cnt += 1
             print 'processed %s files' % processed_file_cnt
             # if processed_file_cnt >= 1000:
             #     break
     for rating, comment_cnt in comment_cnt_dict.items():
         token_df_dict = token_df_dicts[rating]
         token_df_list = sorted(token_df_dict.items(), key=lambda x: -x[1])
         with open(os.path.join(Config.opinion_path, str(rating)), 'w') as output_obj:
             output_obj.write('%s\n' % comment_cnt)
             for token, df in token_df_list:
                 output_obj.write('%s\t%s\n' % (token.encode('utf8'), df))
Example #17
    def add_feature(self, feature, **kwargs):
        """ add_feature(self, feature, **args)

            o feature       Bio.SeqFeature object

            o **kwargs      Keyword arguments for Feature.  Named attributes
                            of the Feature
                                                        

            Add a Bio.SeqFeature object to the diagram (will be stored
            internally in a Feature wrapper
        """
        id = self.next_id  # get id number
        self.features[id] = Feature(self, id, feature)  # add feature
        for key in kwargs:
            if key == "colour" or key == "color":
                #Deal with "colour" as a special case by also mapping to color.
                #If Feature.py used a python property we wouldn't need to call
                #set_color explicitly.  However, this is important to make sure
                #every color gets mapped to a colors object - for example color
                #numbers, or strings (may not matter for PDF, but does for PNG).
                self.features[id].set_color(kwargs[key])
                continue
            setattr(self.features[id], key, kwargs[key])
        self.next_id += 1  # increment next id
Example #18
    def prepare(self, scales):
        # const vector<Size>& scales
        # Initialize test locations for features
        totalFeatures = self.nstructs * self.structSize
        for i in range(len(scales)):
            tmp = []
            self.features.append(tmp)

        for i in range(totalFeatures):
            x1f = random.random()
            x2f = random.random()
            y1f = random.random()
            y2f = random.random()
            for j in range(len(scales)):
                # scales[j][0] = width, scales[j][1] = height
                x1 = x1f * scales[j][0]
                y1 = y1f * scales[j][1]
                x2 = x2f * scales[j][0]
                y2 = y2f * scales[j][1]
                self.features[j].append(Feature(x1, y1, x2, y2))

        # Thresholds
        self.thrN = 0.5 * self.nstructs

        # Initialize Posteriors
        # positives = Pcounter, negatives = Ncounter
        for i in range(self.nstructs):
            self.posteriors.append([0] * pow(2, self.structSize))
            self.pCounter.append([0] * pow(2, self.structSize))
            self.nCounter.append([0] * pow(2, self.structSize))
Example #19
    def __scaffold_contigs(self, contig_ids=None):
        seq = str(self.get_original_seq()).upper()
        s_id = self.get_name()
        slen = len(seq)

        i = c_start = 0

        contig_count = 0
        value = 1
        id = None
        last_contig = 0

        while True:
            i = seq.find('N', i)
            if i < 0: break
            # count consecutive Ns
            n_start = i
            while i < slen and seq[i] == 'N':
                i += 1
            # this many Ns in a row constitute a contig break (gap)
            n_len = i - n_start
            if n_len >= self.minGapSize:
                c_len = n_start - c_start
                if c_len >= self.minConSize:
                    id = s_id + "_c" + str(contig_count + 1)
                    if contig_ids:
                        id = contig_ids[contig_count]
                    self.contigs.append(Feature(c_start, n_start, value, id))
                    contig_count += 1
                    last_contig = n_start
                elif contig_count == 0:
                    self.seq_start = i
                c_start = i
                #contig_count += 1

        if last_contig < slen:
            if slen - c_start > self.minConSize:
                id = s_id + "_c" + str(contig_count + 1)
                if contig_ids:
                    id = contig_ids[contig_count]
                self.contigs.append(Feature(c_start, slen, value, id))
            else:
                self.seq_end = last_contig

        self.get_contig_lengths_list()
        assert self.get_contig_length() + self.get_gap_length() == self.get_length()
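The same gap-splitting idea as a standalone sketch (a hypothetical helper with made-up thresholds; the method above additionally assigns contig ids and tracks the seq_start/seq_end trim points):

import re

def split_on_gaps(seq, min_gap=10, min_contig=100):
    contigs, start = [], 0
    for m in re.finditer('N+', seq.upper()):
        if m.end() - m.start() >= min_gap:       # a run of Ns this long is a gap
            if m.start() - start >= min_contig:  # keep sufficiently long contigs
                contigs.append((start, m.start()))
            start = m.end()
    if len(seq) - start >= min_contig:           # trailing contig after last gap
        contigs.append((start, len(seq)))
    return contigs

print(split_on_gaps('A' * 120 + 'N' * 15 + 'C' * 130))
# [(0, 120), (135, 265)]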
Example #20
 def __init__(self, symbole, **traits):
     self.__symbole = symbole
     self.__traits = set(map(lambda x: Feature(x[0], x[1]), traits.items()))
     recup = self.__memory.get(symbole)
     if recup == traits:
         print('This trait-symbol combination already exists.')
     else:
         self.__memory[symbole] = traits
Example #21
    def movie_genome_sim(douban_id, genome_id, movie_tf_idf_path, \
            genome_tf_idf_path):
        movie_tf_idf_dict = dict(Feature.get_tf_idf_from_file(douban_id, \
                movie_tf_idf_path))
        genome_tf_idf_dict = dict(Genome.get_tf_idf_from_file(genome_id, \
                genome_tf_idf_path))

        return Tagging.cos_sim([movie_tf_idf_dict, genome_tf_idf_dict])
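Tagging.cos_sim is not shown here; assuming it computes cosine similarity over two sparse term-weight dicts, a standalone equivalent would look like:

from math import sqrt

def cos_sim_dicts(a, b):
    dot = sum(w * b.get(t, 0.0) for t, w in a.items())
    norm_a = sqrt(sum(w * w for w in a.values()))
    norm_b = sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

print(cos_sim_dicts({'war': 1.0, 'epic': 2.0}, {'war': 2.0}))
# 0.447... (only the shared term 'war' contributes to the dot product)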
Example #22
 def tf_2_tfidf(tf_dict):
     idf_dict = Feature.get_idf_dict()
     if len(tf_dict) > 0:
         tf_idf_list = map(lambda x: (x[0], idf_dict[x[0]]*x[1]), \
                 tf_dict.items())
         sorted_list = sorted(tf_idf_list, key=lambda x: -x[1])
         return dict(sorted_list[:100])
     else:
         return {}
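A worked toy version of the same tf-to-tf-idf step, standalone (the tf and idf values are made up rather than coming from Feature.get_idf_dict):

tf_dict = {'movie': 3, 'great': 1}
idf_dict = {'movie': 0.5, 'great': 2.0}
tf_idf_list = map(lambda x: (x[0], idf_dict[x[0]] * x[1]), tf_dict.items())
sorted_list = sorted(tf_idf_list, key=lambda x: -x[1])
print(dict(sorted_list[:100]))  # {'great': 2.0, 'movie': 1.5}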
Example #23
    def _feature(self, i):
        def context(input):
            line = input.words[input.index][2]
            inputId = (input.input.path, line[0])
            if inputId != self.cachedInputId:
                raise Exception('Unexpected call to feature')
            return self.cachedScopeChain[self.cachedDepth - 1 - i]

        return Feature('{}scope'.format(i), context, word)
Example #24
 def __init__(self, csvFileName):
     self.csvFileName = csvFileName
     self.table = []
     df = pd.read_csv(csvFileName)
     t_flag = 1
     index = []
     for i in range(1, len(df) + 1):
         index.append(i)
     for col in df.columns:
         if df[col].dtype == "float64" or df[col].dtype == "int64":
             f = Feature(col,df[col].values)
             self.table.append(f)
             b = f.getSampels() == index
             if b.all():
                 t_flag = 0            
     if t_flag:
         f = Feature("TimeStamp",index)
         self.table.append(f)
Example #25
def preprocess(paths, classes):
    f = Feature()
    X = []
    y = []
    start = time.clock()
    for index, path in enumerate(paths):
        print('Preprocessing', index, path)
        ar = f.getFeature(path)
        if ar.all() == 0:
            continue
        X.append(ar)
        y.append(classes[index])

    test_time = time.clock() - start
    print("Preprocessing Total time: {0:.2f}".format(test_time))
    X = np.array(X)
    y = np.array(y)
    return X, y
Example #26
    def LoadDigitData(self, file_path):
        feature = []
        for line in open(file_path):
            line = line.strip()
            line_feature = [ord(ch) - ord('0') for ch in line]
            feature.extend(line_feature)

        self.dim = len(feature)
        return Feature(np.array(feature))
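The ord arithmetic converts each '0'/'1' character of a line into an integer; a one-line check with made-up line contents:

line = '01101'
print([ord(ch) - ord('0') for ch in line])  # [0, 1, 1, 0, 1]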
Example #27
def create_feature(feature_description):
    feature_name = feature_description[0]
    if "{" in feature_description[1] and "}" in feature_description[1]:
        feature_type = "CATEGORICAL"
    else:
        feature_type = "NUMERIC"
    feature_possible_values = (feature_description[1].replace("{", "").replace("}", "")).split(",")
    feature = Feature(feature_name, feature_type, feature_possible_values)
    # append feature to features_list
    features_list.append(feature)
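A standalone trace of the type detection above, using hypothetical attribute descriptions in the name-then-domain shape the function expects:

for desc in (['outlook', '{sunny,overcast,rainy}'], ['temperature', 'real']):
    ftype = 'CATEGORICAL' if '{' in desc[1] and '}' in desc[1] else 'NUMERIC'
    values = desc[1].replace('{', '').replace('}', '').split(',')
    print(desc[0], ftype, values)
# outlook CATEGORICAL ['sunny', 'overcast', 'rainy']
# temperature NUMERIC ['real']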
Example #28
def getVisualVector(imset):
    try:
        from Feature import Feature, FeatureType
        feature = Feature.factory(type=FeatureType.GIST, im_set=[imset])
        feature.process()

        return feature.results
    except Exception as e:
        print(e)
        return None
Example #29
def decorate_wall(obj, options):

    if options.options.windows == "window_line":
        spaced_points = vg.extrude(
            obj.bottom(), Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        for vec in spaced_points:
            obj.features.append(Feature("window", vec,
                                        options=options.options))

    elif options.options.windows == "window_line_double":
        spaced_points = vg.extrude(
            obj.bottom(), Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        spaced_points2 = vg.extrude(spaced_points, Map(spacing=V3(0, 1, 0)))
        for vec in spaced_points + spaced_points2:
            obj.features.append(Feature("window", vec,
                                        options=options.options))

    elif options.options.windows == "window_slits":
        spaced_points = vg.points_spaced(obj.bottom(), Map(every=5))
        spaced_points = vg.extrude(
            spaced_points, Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        spaced_points2 = vg.extrude(spaced_points, Map(spacing=V3(0, 1, 0)))
        for vec in spaced_points + spaced_points2:
            obj.features.append(Feature("spacing", vec))

    else:
        spaced_points = vg.points_spaced(obj.bottom(), Map(every=3))
        spaced_points = vg.extrude(
            spaced_points, Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        for vec in spaced_points:
            obj.features.append(Feature("window", vec,
                                        options=options.options))

    mid_points = vg.middle_of_line(obj.bottom(),
                                   Map(center=True, max_width=2, point_per=10))
    for vec in mid_points:
        obj.features.append(
            Feature(
                "door", vec,
                Map(cardinality=obj.cardinality,
                    door_inside=options.options.door_inside)))

    return obj
Example #30
    def _get_feature(self, text_obj):
        """Get feature data for a whole text object."""
        try:
            for relation in text_obj.relations:
                if relation.is_event_event():
                    f = Feature(relation, strings_cache_g, nlp_persistence_obj_g, duration_cache_g, discourse_cache_g, features_event_event_g)
                    feature = f.get_feature()
                    relation.set_feature(feature)

                elif relation.is_event_timex():
                    f = Feature(relation, strings_cache_g, nlp_persistence_obj_g, duration_cache_g, discourse_cache_g, features_event_timex_g)
                    feature = f.get_feature()
                    relation.set_feature(feature)

                # Append feature to relation in text_obj.relations_plain if present
                if relation.is_event_event() or relation.is_event_timex():
                    if relation in text_obj.relations_plain:
                        # Search for relation
                        for rel in text_obj.relations_plain:
                            if rel == relation:
                                rel.set_feature(feature)
                                break

            # Print progress
            with _counter_lock:
                _counter.value += 1

                sys.stdout.write("\r%d%%" % int(_counter.value*100/(_length - 1)))
                sys.stdout.flush()

            return text_obj

        except Exception as e:
            # Print progress
            with _counter_lock:
                _counter.value += 1

                sys.stdout.write("\r%d%%" % int(_counter.value*100/(_length - 1)))
                sys.stdout.flush()

            print e
            print traceback.format_exc()
Example #31
 def cal_tf(genome_id):
     genome_movie_dict = Genome.load_genome_movie()
     movie_set = genome_movie_dict.get(genome_id, set())
     #print '%s have %s movies' % (genome_id, len(movie_set))
     genome_tf_dict = {}
     for douban_id in movie_set:
         movie_tf_dict = Feature.get_tf_from_file(douban_id)
         #print '%s have %s terms' % (douban_id, len(movie_tf_dict))
         for term, freq in movie_tf_dict.items():
             genome_tf_dict[term] = genome_tf_dict.get(term, 0) + freq
     return genome_tf_dict
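The accumulation pattern above, shown with made-up per-movie tf dicts standing in for Feature.get_tf_from_file:

movie_tf_dicts = [{'war': 2, 'love': 1}, {'war': 1, 'peace': 4}]
genome_tf_dict = {}
for movie_tf_dict in movie_tf_dicts:
    for term, freq in movie_tf_dict.items():
        genome_tf_dict[term] = genome_tf_dict.get(term, 0) + freq
print(genome_tf_dict)  # {'war': 3, 'love': 1, 'peace': 4}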
Example #32
 def __init__(self, buffer, language):
     """
     Constructor.
     
     buffer -- the associated TextBuffer
     language -- the spell checking language
     """
     Feature.__init__(self, buffer)
     if type(language) != type([]):
         language = [language]
     self.dicts         = []
     self.changed_lines = []
     self.tag           = buffer.create_tag('incorrect',
                                            underline  = pango.UNDERLINE_SINGLE,
                                            foreground = 'red')
     buffer.connect('insert-text',  self._on_buffer_insert_text)
     buffer.connect('delete-range', self._on_buffer_delete_range_after)
     buffer.connect('changed',      self._on_buffer_changed)
     for lang in language:
         self.dicts.append(enchant.Dict(lang))
Example #33
def add_ego_net_features_to_ego(ego, ego_feature_file, ego_net_features):
    '''Reads a one-line file of features for the ego node'''
    line_list = ego_feature_file.readline().split()  # read one line
    # i is the index, digit is the value
    for i, digit in enumerate(line_list):
        # in order to add a feature we must create a Feature instance
        ego.add_feature(
            i,
            Feature(ego_net_features[i][1], ego_net_features[i][0],
                    int(digit)))
    return ego
Example #34
 def get_sentences(self, seg_result):
     sentences = []
     start = 0
     end = 0
     words = seg_result['ret']
     while end < len(words):
         sentence, end = self.get_first_sentence(words, start)
         if len(Feature.get_verb_noun(sentence)) > 0:
             sentences.append(sentence)
         start = end
     return sentences
Example #35
 def __init__(self, buffer, language):
     """
     Constructor.
     
     buffer -- the associated TextBuffer
     language -- the spell checking language
     """
     Feature.__init__(self, buffer)
     if type(language) != type([]):
         language = [language]
     self.dicts = []
     self.changed_lines = []
     self.tag = buffer.create_tag('incorrect',
                                  underline=pango.UNDERLINE_SINGLE,
                                  foreground='red')
     buffer.connect('insert-text', self._on_buffer_insert_text)
     buffer.connect('delete-range', self._on_buffer_delete_range_after)
     buffer.connect('changed', self._on_buffer_changed)
     for lang in language:
         self.dicts.append(enchant.Dict(lang))
Example #36
def main():
    train_email_data = EmailData()
    train_email_data.load_from_file('data/train')

    feature = Feature()
    feature.learn(train_email_data)
    train_data_set = feature.translate_email_data(train_email_data)

    #print(feature.features)

    naive_bayesian = NaiveBayesian()
    naive_bayesian.learn(feature, train_data_set)

    test_email_data = EmailData()
    test_email_data.load_from_file('data/test')
    test_data_set = feature.translate_email_data(test_email_data)

    print('# Training set')
    test(naive_bayesian, train_data_set)
    print('# Testing set')
    test(naive_bayesian, test_data_set)
Example #37
 def __init__(self, buffer):
     """
     Constructor.
     
     buffer -- the associated TextBuffer
     """
     Feature.__init__(self, buffer)
     self.bullet_point = u'•'
     self.lock_signals = None
     self.start_tag = buffer.create_tag('list-start',
                                        #foreground  = 'lightblue',
                                        left_margin = 30,
                                        pixels_above_lines = 12)
     self.bullet_tag = buffer.create_tag('list-bullet',
                                         #background  = 'orange',
                                         left_margin = 30)
     self.list_tag   = buffer.create_tag('list',
                                         #underline = pango.UNDERLINE_SINGLE,
                                         left_margin        = 30,
                                         pixels_above_lines = 3)
     buffer.connect_after('insert-text',  self._on_buffer_insert_text_after)
     buffer.connect('delete-range',   self._on_buffer_delete_range)
     buffer.connect('mark-set',       self._on_buffer_mark_set)
Example #38
						phage_as_gta.append(testNames[r])
					else: #gta as virus
						gta_as_phage.append(testNames[r])

	# if not MINI:
	# 	print("\nPhages (%d) misclassified over %d reps: %s" % (len(phage_as_gta), nrep, phage_as_gta))
	# 	print("\nGTA (%d) misclassified over %d reps: %s\n" % (len(gta_as_phage), nrep, gta_as_phage))

	return (score0/nrep, score1/nrep)

if __name__ == '__main__':
	# Load profiles
	gta_profs = Loader.load(GTA_PATH, "GTA")
	viral_profs = Loader.load(VIRAL_PATH, "virus")
	# Make features
	feats = Feature(gta_profs.profiles + viral_profs.profiles)
	# kmer
	feats.make_kmer_dict(K)
	feats.kmer_feat()
	# pseaac
	feats.pseaac(lam=LAM, weight=PSE_WEIGHT)
	# physicochem
	feats.physicochem()

	# Xval
	# predictor = KNeighborsClassifier(n_neighbors=10)
	predictor = MultinomialNB()
	result = xval(predictor, gta_profs, viral_profs, NFOLDS, NREPS)
	if MINI:
		print("GTA Correct\tViral Correct")
		print("%.2f\t%.2f" % (result[0], result[1]))
Example #39
 def setPublishDate(self,d): self._publishDate = d if Feature._vDate(d) else None
 
 def setChangeId(self, changeId): 
Example #40
def metagene_count():
    """Chain of command for metagene_count analysis."""
    arguments = get_arguments()
    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, "NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, "NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding 
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print "Metagene definition:\t{}".format(metagene)
    except MetageneError as err:
        print err
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print "Reading feature file as {} format".format(Feature.format)
    except MetageneError as err:
        print err
        raise MetageneError("Unable to create the feature object")

    # print out the header line...
    if not arguments.interval_variable:
        with open("{}.metagene_counts.csv".format(arguments.output_prefix), 'w') as output_file:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            if feature_line[0] != "#":  # skip comment lines
                # change creation with feature_method
                feature = Feature.create(arguments.feature_count, metagene, feature_line, arguments.count_splicing,
                                         arguments.ignore_strand)

                # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
                # than Feature.get_chromosome_region() because only the first ensures that the interval does not
                # extend beyond the length of the chromosome which makes samtools view return no reads
                (run_pipe_worked, sam_sample) = run_pipe(['samtools view {} {}'.format(
                    arguments.alignment,
                    feature.get_samtools_region())])
                if run_pipe_worked:
                    for samline in sam_sample:
                        if len(samline) > 0:
                            # create Read feature
                            (created_read, read) = Read.create_from_sam(samline,
                                                                        Feature.chromosome_conversion.values(),
                                                                        arguments.count_method,
                                                                        arguments.uniquely_mapping,
                                                                        arguments.ignore_strand,
                                                                        arguments.count_secondary_alignments,
                                                                        arguments.count_failed_quality_control,
                                                                        arguments.count_PCR_optical_duplicate,
                                                                        arguments.count_supplementary_alignment)

                            # count read (if it exists)
                            if created_read:
                                feature.count_read(read, arguments.count_method, arguments.count_splicing,
                                                   arguments.count_partial_reads, arguments.ignore_strand)

                    # output the resulting metagene
                    with open("{}.metagene_counts.csv".format(arguments.output_prefix), 'a') as output_file:
                        output_file.write(
                            "{}\n".format(feature.print_metagene(interval_override=arguments.interval_variable)))

                else:
                    raise MetageneError("Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                        feature.get_chromosome_region(),
                        feature.name,
                        arguments.alignment))
Example #41
if __name__ == '__main__':
	start = time.time()

	# Get args
	parser = get_args()
	args = parser.parse_args()
	# Print detail
	mini = args.mini
	### Load training set and make features ###
	gta_file = args.gta[0]
	virus_file = args.virus[0]
	# Load profiles
	gta_profs = Loader.load(gta_file, "GTA")
	viral_profs = Loader.load(virus_file, "virus")
	# Make features
	feats = Feature(gta_profs.profiles + viral_profs.profiles)
	if args.kmer is not None:
		kmer_size = args.kmer
		feats.make_kmer_dict(kmer_size)
		feats.kmer_feat()
	if args.pseaac is not None:
		feats.pseaac(lam=int(args.pseaac), weight=PSE_WEIGHT)
	if args.physico:
		feats.physicochem()

	if args.kmer is None and args.pseaac is None and not args.physico:
		print("You must specify at least one feature type (-k, -p, -y).")

	else:
		# Weight if needed
		if args.weight:
Example #42
def evaluate(ans, res):
    total = 0.0
    for i in range(len(ans)):
        total += math.fabs(ans[i] - res[i])
    return total

if __name__ == '__main__':
    # Parses arguments
    if len(sys.argv) != 3 or (sys.argv[1] not in ['pca', 'ae']):
        print ("Usage:", sys.argv[0], "[dim reduce method (pca/ae)] [dimension]")
        sys.exit(1)

    DIM = int(sys.argv[2])

    feature = Feature()

    train_X, train_Y = feature.getYearFeatures(2015)
    test_X, test_Y = feature.getYearFeatures(2010)
    print ("All data prepared.")

    train_X_reduce = None
    test_X_reduce = None
    if sys.argv[1] == 'pca':
        pca = PCA(n_components = DIM)
        train_X_reduce = np.concatenate((
            pca.fit_transform(np.array([ x[0]+x[1] for x in train_X ])),
            np.array([ x[2] for x in train_X ])
        ), axis=1)
        # Applies same model on test data.
        test_X_reduce = np.concatenate((
            pca.transform(np.array([ x[0]+x[1] for x in test_X ])),
            np.array([ x[2] for x in test_X ])
        ), axis=1)
Example #43
def setup():
    work = get_work()
    config = Config()
    snappy = Snappy()
    feature = Feature()

    globalLength = config.getElementLength()
    
    for fan in config.getFans():
        snappy.addFan(fan)
        
    for blank in config.getBlanks():
        snappy.addBlank(blank)
           
    for baffle in config.getBaffles():
        snappy.addBaffle(baffle)
           
    for solid in config.getSolids():
        snappy.addSolid(solid)
        
    for refinementRegion in config.getRefinementRegions():
        snappy.addRefinementRegion(refinementRegion)
        
    for geom in config.getSolids() + config.getFans() + config.getBlanks() + config.getBaffles():
        localLength = config.getElementLength(geom)
        refinementLevel = calcRefinementLevel(globalLength, localLength)
        snappy.setRefinement(refinementLevel, geom)
        feature.addGeom(geom)

    for geom in config.getRefinementRegions():
        localLength = config.getElementLength(geom)
        refinementLevel = calcRefinementLevel(globalLength, localLength)
        snappy.setRegionRefinement(refinementLevel, geom)
        
    
    boundingBox = config.getBoundingBox()
    dist = config.getBoundingBoxDistance()

    fluidBoundaries = ([x - dist for x in boundingBox[0:3]] + 
                       [x + dist for x in boundingBox[3:6]])

    location = [x - .00111 for x in fluidBoundaries[3:6]]
    snappy.setLocation(location)

    bmName = os.path.join(work.polyMeshDir(), "blockMeshDict")

#    template=TemplateFile(bmName + ".template")
#    template.writeToFile(bmName, {'minx': fluidBoundaries[0],
#                                  'miny': fluidBoundaries[1],
#                                  'minz': fluidBoundaries[2],
#                                  'maxx': fluidBoundaries[3],
#                                  'maxy': fluidBoundaries[4],
#                                  'maxz': fluidBoundaries[5],
#                                  'size': globalLength})

    minx = fluidBoundaries[0]
    miny = fluidBoundaries[1]
    minz = fluidBoundaries[2]
    maxx = fluidBoundaries[3]
    maxy = fluidBoundaries[4]
    maxz = fluidBoundaries[5]    

    blockMesh = ParsedParameterFile(bmName)
    blockMesh["vertices"] = [
        "(%.6f %.6f %.6f)" % (minx, miny, minz),
        "(%.6f %.6f %.6f)" % (maxx, miny, minz),
        "(%.6f %.6f %.6f)" % (maxx, maxy, minz),
        "(%.6f %.6f %.6f)" % (minx, maxy, minz),
        "(%.6f %.6f %.6f)" % (minx, miny, maxz),
        "(%.6f %.6f %.6f)" % (maxx, miny, maxz),
        "(%.6f %.6f %.6f)" % (maxx, maxy, maxz),
        "(%.6f %.6f %.6f)" % (minx, maxy, maxz)]
    
    numx = int((maxx-minx)/globalLength)
    numy = int((maxy-miny)/globalLength)
    numz = int((maxz-minz)/globalLength)
    
    blockMesh["blocks"][2] = "(%d %d %d)" % (numx, numy, numz)
    blockMesh.writeFile()
Example #44
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Baseline method for MD final project.'
    )
    parser.add_argument('--year', '-y', help='Year to test',
                        dest='year', type=int, required=True)
    parser.add_argument('--node-threshold', '-n', help='Node features # threshold',
                        dest='node_threshold', type=int, default=150)
    parser.add_argument('--migration-threshold', '-m',
                        help='Migrants non-zero # threshold',
                        dest='migrat_threshold', type=int, default=40)

    args = parser.parse_args(sys.argv[1:])

    feature = Feature(args.node_threshold, args.migrat_threshold)

    ans = baseline(feature, args.year)

    # Getting real answer.
    real = feature.getValidation(args.year)
    real_ans = []
    for tar, iy in feature.country_index.items():
        for src, ix in feature.country_index.items():
            real_ans.append(real[(src, tar)])

    N_country = len(feature.country_index)

    error = evaluate(real_ans, ans)
    print ("Baseline error:", error)
    print ("Average error:", error / len(real_ans))
Example #45
 def __init__(self, *args, **kwargs):
     Feature.__init__(self)
     self.persons = kwargs['persons']
Example #46
import numpy as np
import sys
#sys.path.append('data/validation')
from Feature import Feature
from pca import DR_PCA
from math import sqrt, fabs
from cvxopt import matrix, spmatrix, solvers
import AE

dim = input('dim = ')
migration_threshold = input('migration_threshold = ')
op = input('type 0 to use PCA, 1 to use AE : ')
f = Feature(node_threshold=180, migration_threshold=migration_threshold)
X, Y = f.getYearFeatures(2015)
X0 = []
X1 = []
for i in range(len(X)):
    X0.append(X[i][0])
    X1.append(X[i][1])
if op == 0:
    x0 = DR_PCA(X0, dim)
    x1 = DR_PCA(X1, dim)
else:
    X0 = np.array(X0)
    X1 = np.array(X1)
    x, w, b = AE.dim_reduce(X0, dim, 2, 20, 0.01)
    x0 = AE.forward2hidden(X0, w, b, 2)
    x, w, b = AE.dim_reduce(X1, dim, 2, 20, 0.01)
    x1 = AE.forward2hidden(X1, w, b, 2)
x = []
for i in range(len(X)):
Example #47
 def __init__(self, *args, **kwargs):
   Feature.__init__(self)
   self.stemmer = Stemmer('english')
   self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
   self.stopList = frozenset(['a', 'abaft', 'aboard', 'about', 'abov', 'absent', 'accord', 'account', 'across', 'addit', 'afor', 'after', 'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi', 'also', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'anenst', 'ani', 'anoth', 'anybodi', 'anyhow', 'anyon', 'anyth', 'anywher', 'apart', 'apr', 'april', 'apropo', 'apud', 'are', 'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop', 'aug', 'august', 'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor', 'begin', 'behalf', 'behest', 'behind', 'below', 'beneath', 'besid', 'best', 'better', 'between', 'beyond', 'big', 'bigger', 'biggest', 'billion', 'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call', 'can', 'cannot', 'cant', 'case', 'circa', 'close', 'concern', 'could', 'couldt', 'current', 'daili', 'day', 'dec', 'decemb', 'despit', 'did', 'do', 'doe', 'doesnt', 'done', 'dont', 'down', 'due', 'dure', 'each', 'eight', 'eighteen', 'eighth', 'eighti', 'eleven', 'end', 'enough', 'ever', 'except', 'exclud', 'fail', 'far', 'feb', 'februari', 'few', 'fifth', 'first', 'five', 'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four', 'fourteen', 'fourth', 'fourti', 'fri', 'friday', 'from', 'front', 'full', 'further', 'get', 'given', 'go', 'gone', 'goot', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'he', 'her', 'here', 'herself', 'high', 'higher', 'hightst', 'himself', 'his', 'how', 'hunderd', 'i', 'if', 'in', 'includ', 'insid', 'instead', 'into', 'is', 'it', 'itself', 'jan', 'januari', 'jul', 'juli', 'jun', 'june', 'just', 'last', 'late', 'later', 'latest', 'left', 'lest', 'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest', 'made', 'make', 'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid', 'midst', 'might', 'milliard', 'million', 'mine', 'minus', 'mld', 'mln', 'modulo', 'mon', 'monday', 'month', 'more', 'most', 'mth', 'much', 'must', 'my', 'myself', 'near', 'need', 'neednt', 'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti', 'no', 'none', 'nor', 'not', 'notwithstand', 'nov', 'novemb', 'number', 'o', 'oct', 'octob', 'of', 'off', 'on', 'one', 'onli', 'onto', 'oppos', 'opposit', 'or', 'order', 'other', 'ought', 'our', 'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per', 'place', 'plus', 'point', 'previous', 'prior', 'pro', 'pursuant', 'put', 'qua', 'rather', 'recent', 'regard', 'regardless', 'respect', 'right', 'round', 'said', 'sake', 'same', 'san', 'sat', 'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen', 'sep', 'septemb', 'seven', 'seventeen', 'seventh', 'seventi', 'sever', 'shall', 'she', 'should', 'shouldnt', 'show', 'shown', 'sinc', 'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest', 'so', 'some', 'somebodi', 'somehow', 'someon', 'someth', 'somewher', 'soon', 'sooner', 'spite', 'start', 'still', 'subsequ', 'such', 'sun', 'sunday', 'take', 'taken', 'tell', 'ten', 'tenth', 'than', 'thank', 'that', 'the', 'their', 'them', 'themselv', 'there', 'these', 'they', 'third', 'thirteen', 'thirti', 'this', 'those', 'thousand', 'three', 'through', 'throughout', 'thru', 'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today', 'told', 'too', 'took', 'top', 'toward', 'tue', 'tuesday', 'twelv', 'twenti', 'two', 'under', 'underneath', 'unit', 'unlik', 'until', 'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice', 'view', 'virtu', 'vis', 'visavi', 'vs', 'was', 'we', 'wed', 'wednesday', 'week', 'well', 'went', 'were', 'what', 'when', 'where', 'whether', 'whi', 'which', 'while', 'who', 'whose', 'will', 'with', 'within', 
'without', 'wont', 'wors', 'worst', 'worth', 'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet', 'you', 'your', 'yourself', 'yourselv', 'yr'])
Example #48
 def setEmail(self, email):
     self._email = email if Feature._vEmail(email) else None
Example #49
 def __init__(self, *args, **kwargs):
   Feature.__init__(self)
Example #50
def setup():
    """Create fixtures"""

    # Define chromosome sizes
    Read.extract_chromosome_sizes(["@HD\tVN:1.0\tSO:unsorted",
                                   "@SQ\tSN:chr1\tLN:300",
                                   "@SQ\tSN:chr2\tLN:200",
                                   "@PG\tID:test\tVN:0.1"])
    Feature.process_set_chromosome_conversion(["1\tchr1",
                                               "2\tchr2"])

    good_input["bed input counting all of the read"] = ("all",
                                                        "[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]")
    good_input["bed input counting start of the read"] = ("start",
                                                          "[17, 18, 19, 20, 21, 22, 23]")
    good_input["bed input counting end of the read"] = ("end",
                                                        "[36, 37, 38, 39, 40, 41, 42]")
    good_input["gff input counting all of the read"] = ("all",
                                                        "[43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8]")
    good_input["gff input counting start of the read"] = ("start",
                                                          "[43, 42, 41, 40, 39, 38, 37]")
    good_input["gff input counting end of the read"] = ("end",
                                                        "[14, 13, 12, 11, 10, 9, 8]")


    for method in ['all', 'start', 'end']:
        print "\nTesting feature_count option: ****{}****".format(method)

        if method == 'all':
            metagene = Metagene(10, 4, 2)
            print "\t  with Metagene:\t{}".format(metagene)
            print "\t  with chromosome conversions:\t{}".format(Feature.chromosome_conversion)
        else:
            metagene = Metagene(1, 4, 2)
            print "\t  with Metagene:\t{}".format(metagene)
            print "\t  with chromosome conversions:\t{}".format(Feature.chromosome_conversion)


        # create feature from BED line
        try:
            bedline = "{}\t{}\t{}\t{}\t{}\t{}\n".format(1, 20, 40, "first", 44, "+")
            print "\t  with BED line:\t{}".format(bedline.strip())
            feature1 = Feature.create_from_bed(method, metagene, bedline, False, False)
            if str(feature1.position_array) != correct_features['bed'][method]:
                print "**FAILED**\t  Create Feature from BED line ?"
                print "\t  Desired positions:\t{}".format(correct_features['bed'][method])
                print "\t  Created positions:\t{}".format(feature1.position_array)
        except MetageneError as err:
            print "**FAILED**\t  Create Feature from BED line ?"
        else:
            print "PASSED\t  Create Feature from BED line ?\t\t{}".format(feature1.get_chromosome_region())

        # create feature from GFF line
        try:
            gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(2, "test", "gene", 10, 39, ".", "-", ".", "second")
            print "\t  with GFF line:\t{}".format(gffline.strip())
            feature2 = Feature.create_from_gff(method, metagene, gffline, False, False)
            if str(feature2.position_array) != correct_features['gff'][method]:
                print "**FAILED**\t  Create Feature from GFF line ?\t**FAIL**"
                print "\t  Desired positions:\t{}".format(correct_features['gff'][method])
                print "\t  Created positions:\t{}".format(feature2.position_array)
        except MetageneError as err:
            print "**FAILED**\t  Create Feature from GFF line ?"
        else:
            print "PASSED\t  Create Feature from GFF line ?\t\t{}".format(feature2.get_chromosome_region())

        # create feature from GFF line with start and end swapped
        try:
            gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(2, "test", "gene", 39, 10, ".", "-", ".", "second")
            print "\t  with GFF line:\t{}".format(gffline.strip())
            feature2 = Feature.create_from_gff(method, metagene, gffline, False, False)
            if str(feature2.position_array) != correct_features['gff'][method]:
                print "**FAILED**\t  Create Feature from GFF line with swapped start and end ?\t**FAIL**"
                print "\t  Desired positions:\t{}".format(correct_features['gff'][method])
                print "\t  Created positions:\t{}".format(feature2.position_array)
        except MetageneError as err:
            print "**FAILED**\t  Create Feature from GFF line with swapped start and end ?"
        else:
            print "PASSED\t  Create Feature from GFF line with swapped start and end ?\t\t{}".format(
                feature2.get_chromosome_region())
        try:
            gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(2, "test", "gene", 39, 10, ".", "+", ".", "second")
            print "\t  with GFF line:\t{}".format(gffline.strip())
            feature2 = Feature.create_from_gff(method, metagene, gffline, False, False)
            if str(feature2.position_array) != correct_features['gff'][method]:
                print "**FAILED**\t  Do not create Feature from GFF line with swapped start and end, + strand ?\t**FAIL**"
                print "\t  Desired positions:\t{}".format(correct_features['gff'][method])
                print "\t  Created positions:\t{}".format(feature2.position_array)
        except MetageneError as err:
            print "PASSED\t  Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(
                err)
        else:
            print "**FAILED**\t  Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(
                feature2.get_chromosome_region())


        ##TODO finish complete testing of Feature class
    print "\n##TODO finish testing of Feature class creation\n"

    print "\n**** Testing counting and maniputlation ****\n"

    expected = {'all': {}, 'start': {}, 'end': {}}
    #  Positions in metagene:                           17    18     19   20  21-22,23-24,25-26,27-28,29-30,31-32,33-34,35-36,37-38,39-40,  41,   42
    expected['all'] = {
    'all': "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000,0.000,0.286,0.571,0.571,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.100,0.100,0.100,0.100,0.100,0.000,0.000,0.000,0.000,0.000,0.111",
    'start': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000",
    'end': "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000"}
    #  Positions in metagene:                           17    18    19    20   [21]   22    23
    expected['start'] = {
    'all': "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.050",
    'start': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
    'end': "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.500"}
    #  Positions in metagene:                           36    37    38    39   [40]   41    42
    expected['end'] = {
    'all': "first,sense:allreads,0.000,0.000,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.111",
    'start': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
    'end': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,1.000"}

    metagene = {'all': Metagene(10, 4, 2),
                'start': Metagene(1, 4, 2),
                'end': Metagene(1, 4, 2)}

    for method in ['all', 'start', 'end']:
        print "\t  with Metagene:\t{}".format(metagene[method])
        print "\t  with chromosome conversions:\t{}".format(Feature.chromosome_conversion)

        print "\nTesting feature_count option: ****{}****".format(method)
        feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(1, 20, 40, "first", 44, "+")
        feature1 = Feature.create_from_bed(method, metagene[method], feature_line, False, False)
        print "\tFeature:\t{}".format(feature1.position_array)

        reads = []
        reads.append(Read("chr1", "+", 3, 1, [10, 11, 12, 13, 14, 15, 16, 17, 18]))
        reads.append(Read("chr1", "-", 1, 2, [23, 24, 25, 26, 27, 28, 29, 30, 31, 32]))
        reads.append(Read("chr1", "+", 4, 2, [30, 31, 32, 33, 34, 40, 41]))
        reads.append(Read("chr1", "-", 1, 1, [42, 43, 44, 45, 46, 47, 48, 49, 50]))

        reads.append(Read("chr1", "+", 10, 1, [51, 52, 53, 54, 55]))
        reads.append(Read("chr2", "+", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25]))

        # starting count
        for count_method in ['all', 'start', 'end']:
            print "\nTesting count_method option: ****{}****".format(count_method)

            output = "{}\n".format(feature1)

            for r in reads:
                output += "{}\n".format(r)
                feature1.count_read(r, count_method, count_partial_reads=True)
                output += "{}\n".format(feature1)

            output += feature1.print_metagene(pretty=True)
            if str(feature1.print_metagene()).strip() == str(expected[method][count_method]).strip():
                print "PASSED\tCreated correct metagene with feature method {} and count method {} ?".format(method,
                                                                                                             count_method)
            else:
                print "**FAILED**\tCreated correct metagene with feature method {} and count method {} ?".format(method,
                                                                                                                 count_method)
                print "\tExpected:\n{}".format(expected[method][count_method])
                print "\tActual  :\n{}".format(feature1.print_metagene())
                print "\tSummary of run:\n{}".format(output)
            feature1 = Feature.create_from_bed(method, metagene[method], feature_line, False,
                                               False)  # zero out counter for next round

    try:
        unstranded_read = Read("chr1", ".", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25])
        feature1.count_read(unstranded_read, 'all')
    except MetageneError as err:
        print "PASSED\tCaught unstranded read on stranded count ?\t\t".format(err)
    else:
        print "**FAILED**\tCaught unstranded read on stranded count ?"

    try:
        feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(1, 20, 40, "first", 44, ".")
        feature1 = Feature.create_from_bed(method, metagene[method], feature_line, False, False)
        unstranded_read = Read("chr1", ".", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25])
        feature1.count_read(unstranded_read, 'all')
    except MetageneError as err:
        print "**FAILED**\tAllowed unstranded read on unstranded count ?\t\t".format(err)
    else:
        print "PASSED\tAllowed unstranded read on unstranded count ?"

    print "\n**** Testing adjust_to_metagene ****\n"

    chromosome_converter = {"1": "chr1", "2": "chr2"}

    # ((metagene_tuple), (feature_tuple), expected_result_string, message_string)
    tests = [((8, 2, 2), (16, 8, 24, 4), '8.000,8.000,4.000,4.000,12.000,12.000,2.000,2.000', "Expand to metagene ?"),
             ((4, 2, 2), (6, 8, 6, 2, 4, 4, 2, 4, 24, 8), '17.000,9.000,8.000,34.000', "Contract to metagene ?"),
             ((4, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4), '5.500,9.333,17.825,9.475',
              "Contract with messy floats ?"),
             ((3, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4), '7.611,19.556,14.967',
              "Contract with other messy floats ?")]

    for t in tests:
        metagene = Metagene(*t[0])
        print "\t{}".format(metagene)
        feature_line = "{}\t{}\t{}\n".format(1, 0, len(t[1]))
        feature = Feature.create_from_bed('all', metagene, feature_line, False, False, short=True)
        adjusted_feature = ""
        for f in feature.adjust_to_metagene(t[1]):
            adjusted_feature += "{0:0.3f},".format(f)
        if adjusted_feature[:-1] == t[2]:
            print "PASSED\t{}".format(t[3])
        else:
            print "**FAILED**\t{}".format(t[3])
            print "\tExpected:\t{}".format(t[2])
            print "\tActual  :\t{}".format(adjusted_feature[:-1])
            print "\tOriginal:\t{}".format(feature.adjust_to_metagene(t[1]))

    print "\n**** End of Testing the Feature class ****\n"

# end of Feature.test method