def triple_classifier(tweet):
    '''
    Output labels:
    0 neutral, 1 positive, 2 angry, 3 anxious, 4 sad, 5 disgusted, 6 other negative
    '''
    sentiment = 0
    if isinstance(tweet['text'], unicode):
        text = tweet['text']
    else:
        text = tweet['text'].decode('utf-8')
    keywords_list = []

    emoticon_sentiment = emoticon(text.encode('utf-8'))
    if emoticon_sentiment != MIDDLE:
        entries = cut(cut_str, text.encode('utf-8'))
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        keywords_list = entry
        if emoticon_sentiment == POSITIVE:
            sentiment = emoticon_sentiment
            text = u''
        else:
            sentiment = flow_psychology_classfiy(text.encode('utf-8'))
            if sentiment == 0:
                sentiment = 6
            text = u''

    if text != u'':
        entries = cut(cut_str, text.encode('utf-8'))
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        keywords_list = entry
        bow = dictionary_1.doc2bow(entry)
        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] <= s[1]:
            bow = dictionary_2.doc2bow(entry)
            s2 = [1, 1]
            for pair in bow:
                s2[0] = s2[0] * (step2_score[pair[0]][0] ** pair[1])
                s2[1] = s2[1] * (step2_score[pair[0]][1] ** pair[1])
            if s2[0] > s2[1]:
                sentiment = POSITIVE
            else:
                sentiment = flow_psychology_classfiy(text.encode('utf-8'))
                if sentiment == 0:
                    sentiment = 6
        else:
            sentiment = MIDDLE

    return sentiment
def make_audio(location, name, d_csv, start_idx, end_idx):
    for i in range(start_idx, end_idx):
        f_name = name + str(i)
        link = "https://www.youtube.com/watch?v=" + d_csv.loc[i][0]
        start_time = d_csv.loc[i][1]
        end_time = start_time + 3.0
        utils.download(location, f_name, link)
        utils.cut(location, f_name, start_time, end_time)
        print("\r Processing audio... {}".format(i), end="")
    print("\r Finished!", end="")
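# Hypothetical usage sketch for make_audio above. The CSV layout (YouTube video
# id in column 0, clip start time in seconds in column 1) is inferred from the
# loop body; "clips.csv" and "./audio" are made-up names.
import pandas as pd

d_csv = pd.read_csv("clips.csv", header=None)
make_audio("./audio", "sample_", d_csv, 0, len(d_csv))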
def cut_text(item):
    text = item['text'].encode('utf-8')
    terms_cx = cut(s, text, cx=True)
    terms = [term for term, cx in terms_cx]
    item['terms'] = terms
    item['terms_cx'] = terms_cx
    return item
def createContainer(self, result):
    time = '00:' + result['duration']
    video_codec = result['video_codec']
    input_path = Path.cwd()
    input_file = str(input_path / "BBB.mp4")
    subtitles_file = str(input_path / "subtitles.srt")
    output_path = Path.cwd() / "Containers" / result['name']
    output_path.mkdir(parents=True, exist_ok=True)
    output_path_container = Path.cwd() / "Containers"
    output_container = str(output_path_container) + "/" + result['name'] + ".mp4"
    output_cut = cut(input_file, output_path, time)
    output_resize = resize(output_cut, output_path, result['size'])
    output_mono = mono(output_cut, output_path, result['audio_codec'])
    output_subtitles = subtitles(subtitles_file, output_path)
    output_lower_bitrate = lower_bitrate(output_cut, output_path, result['bitrate'])
    command = f'ffmpeg -i {output_resize} -i {output_mono} -i {output_lower_bitrate} ' \
              f'-map 0:v -map 1:a -map 2:a -c:a copy -c:v {video_codec} ' \
              f'-vf "ass={output_subtitles}" {output_container}'
    os.system(command)
    os.system(f"ffplay {output_container}")
    return output_container
def transform(self):
    """ Bestow weights on candidates by derivation """
    formulas, arguments, temp = self.fit()
    print("Get the candidates by derivation")
    for mark, formula in formulas:
        feature = self.features[mark]
        # Derivation
        x = utils.get(arguments, mark)
        y = utils.get(temp, mark)
        y_hat = formula.predict(x)
        sections = utils.growth(list(y_hat))
        candidates, phrase = [], ""
        for section in sections:
            for index in section:
                phrase += feature[str(index)]['value']
            candidates.append(phrase)
            phrase = ""
        candidates = "".join(utils.cut(candidates, self._constants))
        for index, sentence in feature.items():
            if sentence["value"] in candidates:
                sentence["deriv"] = 1.2 if sentence["policy"] > 0 else 0.2
        self.features[mark].update(feature)
    print("done")
def load_test():
    ## load word vector
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(name='%s/word2vec/w2v_sgns_%s_%s_%s.npy' % (
            config.ModelOutputDir, config.embedding_size, config.corpus_version, datestr))
    ## load test data
    with utils.timer('Load test data'):
        test_data, uid_list, info_id_list = utils.load_test_data(test_file)
        test_data = test_data[:int(0.2 * len(test_data))]
        uid_list = uid_list[:int(0.2 * len(uid_list))]
        info_id_list = info_id_list[:int(0.2 * len(info_id_list))]
    with utils.timer('representation for test'):
        X_test = []
        text_test = []
        for i in range(len(test_data)):
            text = test_data[i]
            if text == '':
                continue
            words = utils.cut(text)
            if len(words) == 0:
                continue
            X_test.append([word2vec.get(w, word2vec['_UNK']) for w in words])
            text_test.append(text)
    del word2vec
    gc.collect()
    return X_test, text_test, uid_list, info_id_list
def cut_text(item):
    text = item['text'].encode('utf-8')
    # terms_cx = cut(s, text, cx=True)
    # terms = [term for term, cx in terms_cx]
    item['terms'] = cut(s, text, cx=False)
    # item['terms_cx'] = terms_cx
    return item
def pathway_from_files(name, dirname):
    path = pathway(name)
    try:
        for key, suffix in pathway.components.items():
            f = open('%s.%s' % (join(dirname, name), suffix))
            path[key] = cut(f)
    except:
        raise IOError, 'path error'
    return path
def make_network(topic, date, window_size, max_size=100000, ts=False):
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    g = nx.DiGraph()  # need repost index
    topic = cut(s, topic.encode('utf-8'))
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    if ts:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'timestamp', 'retweeted_status'], max_offset=max_size)
    else:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'retweeted_status'], max_offset=max_size)
    print 'topic statuses count %s' % count

    if ts:
        uid_ts = {}
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    repost_ts = int(status['timestamp'])
                    source_status = acquire_status_by_id(rt_mid)
                    source_uid = source_status['user']
                    source_ts = int(source_status['timestamp'])
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    if repost_uid not in uid_ts:
                        uid_ts[repost_uid] = repost_ts
                    elif uid_ts[repost_uid] > repost_ts:
                        uid_ts[repost_uid] = repost_ts
                    if source_uid not in uid_ts:
                        uid_ts[source_uid] = source_ts
                    elif uid_ts[source_uid] > source_ts:
                        uid_ts[source_uid] = source_ts
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return uid_ts, g
    else:
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    source_uid = acquire_status_by_id(rt_mid)['user']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return g
def text_tensor(text):
    words = [w for w in utils.cut(text)]
    print(words)
    if len(words) < 150:
        words = ['_UNK'] * (150 - len(words)) + words
    else:
        words = words[:150]
    words = [word2vec.get(w, word2vec['_UNK']) for w in words]
    words = np.asarray(words)
    sample = words.reshape(1, len(words), config.embedding_size)
    return sample
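# Hypothetical usage sketch for text_tensor above; it relies on the module-level
# word2vec dict (with an '_UNK' entry) and config.embedding_size being loaded,
# and the example string is made up.
sample = text_tensor('an example sentence to embed')
print(sample.shape)  # expected (1, 150, config.embedding_size)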
def compress_districts(unit, remove_voterfile=False):
    print 'running compress_districts'
    from config import locality_name, precinct_name, voterfile_delimiter, reduced_voterfile_name
    import csv
    from utils import cut
    from zipfile import ZipFile
    district_names = dict((e['name_column'], set()) for e in unit.ed_defs)
    print 'dist_names: {}'.format(district_names)
    district_names.update({locality_name: set(), precinct_name: set()})
    vf_columns = set(['voterbase_id']
                     + [column for column_tuple in district_names for column in column_tuple if type(column_tuple) == tuple]
                     + [column for column in district_names if type(column) == str])
    zfile = ZipFile(unit.UNCOMPRESSED_VOTER_FILE_ZIP_LOCATION)
    voter_file_name = zfile.namelist()[0]
    voter_file_full = os.path.join(unit.__path__[0], voter_file_name)
    if not os.path.exists(voter_file_full):
        zfile.extract(voter_file_name, unit.__path__[0])
    column_indexes = dict((column, idx)
                          for idx, column in enumerate(open(voter_file_full).readline().split(voterfile_delimiter))
                          if column in vf_columns)
    extra_district_dicts = {}
    for k, v in (getattr(unit, 'EXTRA_DISTRICTS', None) or {}).iteritems():
        edfile = os.path.join(unit.__path__[0], v['filename'])
        edcsv = csv.reader(open(edfile), delimiter='\t')
        edcsv.next()
        extra_district_dicts[k] = dict((l[0], l[v['column'] - 1]) for l in edcsv)
    with open(os.path.join(unit.__path__[0], reduced_voterfile_name), 'w') as reduced_voterfile:
        print 'reduced_vf: {}'.format(reduced_voterfile)
        reduced_voterfile_csv = csv.DictWriter(reduced_voterfile,
                                               fieldnames=column_indexes.keys() + extra_district_dicts.keys(),
                                               delimiter=voterfile_delimiter)
        reduced_voterfile_csv.writeheader()
        for i, line in enumerate(csv.DictReader(cut(voter_file_full, sorted(column_indexes.values()), voterfile_delimiter),
                                                delimiter=voterfile_delimiter)):
            write_line = False
            for edn, edd in extra_district_dicts.iteritems():
                line.update({edn: edd[line['voterbase_id']]})  # was a set literal; dict.update needs a mapping
            for name_columns, district_set in district_names.iteritems():
                if type(name_columns) == tuple:
                    name = tuple(line[nc] for nc in name_columns)
                else:
                    name = line[name_columns]
                if name not in district_set:
                    write_line = True
                    district_set.add(name)
            if write_line:
                reduced_voterfile_csv.writerow(line)
            if i % 100000 == 0:
                print i
    if remove_voterfile:
        os.remove(voter_file_full)
    with open(os.path.join(unit.__path__[0], 'districts.py'), 'w') as districts_file:
        districts_file.write('state="{state}"\n'.format(state=unit.state_key))
        for ed in unit.ed_defs:
            districts_file.write(ed['district_type'] + '=' + str(district_names[ed['name_column']]) + '\n')
    unit.unit_post_district_trigger()
def normal_tokenize(self, caption):
    # Convert caption (string) to word ids.
    tokens = self.tokenizer.word_tokenize(str(caption).lower())
    if self.filter:
        tokens = filter_freq(tokens, self.count, self.n_filter)
    if self.cut:
        tokens = cut(tokens, self.n_cut)
    caption = []
    caption.append(self.vocab('<start>'))
    caption.extend([self.vocab(token) for token in tokens])
    caption.append(self.vocab('<end>'))
    target = torch.Tensor(caption)
    return target
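# From the call above, this cut appears to truncate the token list to at most
# n_cut items; a minimal stand-in under that assumption (the real helper in
# this repo may differ):
def cut(tokens, n_cut):
    # keep at most n_cut tokens
    return tokens[:n_cut]

print(cut(['a', 'b', 'c', 'd'], 2))  # ['a', 'b']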
def preprocessing(qlist):
    """
    Text preprocessing over qlist. Steps considered:
    - stop-word filtering and lower-casing
    - stripping useless symbols
    - dropping very low-frequency words (appearing fewer than, say, 10 or 20 times)
    - mapping numeric tokens to "#number"
    - stemming (Porter stemmer)
    """
    qlist_data = utils.load_qlist('data/q_prepro.txt')
    stopset = set(stopwords.words('english'))
    minus_words = ["when", "what", "where", "how", "which", "who", "whom"]
    for i in minus_words:
        stopset.discard(i)
    p = PorterStemmer()
    qlist_ = utils.cut(qlist)
    word_dic = Counter([q for l in utils.cut(qlist_data) for q in l])
    low_freq_words = utils.find_low_freq_word(word_dic)
    new_list = []
    # f = open('data/q_prepro.txt', 'w', encoding='utf-8')
    for line in qlist_:
        l = ""
        for word in line:
            word = word.lower()
            # stemming
            word = p.stem(word)
            # strip all punctuation
            word = ''.join(c for c in word if c not in string.punctuation)
            # map digit tokens to "#number"
            if word.isdigit():
                word = "#number"
            # keep words that are neither stop words nor low-frequency words
            if word not in low_freq_words and word not in stopset:
                l += word + " "
        # f.write(l + '\n')
        new_list.append(l)
    # f.close()
    return new_list
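# Worked example of the per-word pipeline in preprocessing above (lowercase ->
# Porter stem -> strip punctuation -> map digit tokens to "#number"), using
# only nltk; the sample words are made up.
import string
from nltk.stem.porter import PorterStemmer

p = PorterStemmer()
for word in ["Running", "3,000", "cities"]:
    w = p.stem(word.lower())
    w = ''.join(c for c in w if c not in string.punctuation)
    if w.isdigit():
        w = "#number"
    print(word, "->", w)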
def compress_districts(unit, remove_voterfile=False):
    from config import locality_name, precinct_name, voterfile_delimiter, reduced_voterfile_name
    import csv
    from utils import cut
    from zipfile import ZipFile
    district_names = dict((e['name_column'], set()) for e in unit.ed_defs)
    district_names.update({locality_name: set(), precinct_name: set()})
    vf_columns = set(['voterbase_id']
                     + [column for column_tuple in district_names for column in column_tuple if type(column_tuple) == tuple]
                     + [column for column in district_names if type(column) == str])
    zfile = ZipFile(unit.UNCOMPRESSED_VOTER_FILE_ZIP_LOCATION)
    voter_file_name = zfile.namelist()[0]
    voter_file_full = os.path.join(unit.__path__[0], voter_file_name)
    if not os.path.exists(voter_file_full):
        zfile.extract(voter_file_name, unit.__path__[0])
    column_indexes = dict((column, idx)
                          for idx, column in enumerate(open(voter_file_full).readline().split(voterfile_delimiter))
                          if column in vf_columns)
    extra_district_dicts = {}
    for k, v in (getattr(unit, 'EXTRA_DISTRICTS', None) or {}).iteritems():
        edfile = os.path.join(unit.__path__[0], v['filename'])
        edcsv = csv.reader(open(edfile), delimiter='\t')
        edcsv.next()
        extra_district_dicts[k] = dict((l[0], l[v['column'] - 1]) for l in edcsv)
    with open(os.path.join(unit.__path__[0], reduced_voterfile_name), 'w') as reduced_voterfile:
        reduced_voterfile_csv = csv.DictWriter(reduced_voterfile,
                                               fieldnames=column_indexes.keys() + extra_district_dicts.keys(),
                                               delimiter=voterfile_delimiter)
        reduced_voterfile_csv.writeheader()
        for i, line in enumerate(csv.DictReader(cut(voter_file_full, sorted(column_indexes.values()), voterfile_delimiter),
                                                delimiter=voterfile_delimiter)):
            write_line = False
            for edn, edd in extra_district_dicts.iteritems():
                line.update({edn: edd[line['voterbase_id']]})  # was a set literal; dict.update needs a mapping
            for name_columns, district_set in district_names.iteritems():
                if type(name_columns) == tuple:
                    name = tuple(line[nc] for nc in name_columns)
                else:
                    name = line[name_columns]
                if name not in district_set:
                    write_line = True
                    district_set.add(name)
            if write_line:
                reduced_voterfile_csv.writerow(line)
            if i % 100000 == 0:
                print i
    if remove_voterfile:
        os.remove(voter_file_full)
    with open(os.path.join(unit.__path__[0], 'districts.py'), 'w') as districts_file:
        districts_file.write('state="{state}"\n'.format(state=unit.state_key))
        for ed in unit.ed_defs:
            districts_file.write(ed['district_type'] + '=' + str(district_names[ed['name_column']]) + '\n')
    unit.unit_post_district_trigger()
def to_handmodel(hand_crop, direction='up'):
    handcontour = utils.skindetect(hand_crop)
    hand = utils.cut(hand_crop, handcontour)
    handskeleton = utils.skeletonize(hand)
    fingerlines = utils.linedetect(handskeleton)
    if direction == 'dn':
        handmodel = map(lambda l: [l[2], l[3]], fingerlines)
    else:
        handmodel = map(lambda l: [l[0], l[1]], fingerlines)
    if sum([1 for _ in handmodel]) > 4:
        handmodel = utils.cluster(handmodel,
                                  value=lambda p: (p[0])**2 + (p[1])**2,
                                  K=4)
        combine = lambda p1, p2: [(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2]
        handmodel = map(lambda l: reduce(combine, l), handmodel)
    return handmodel
def _index_field(field, document, item, schema_version, schema, termgen):
    prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
    field_name = field['field_name']
    # optional terms are handled in pre_func
    if field_name in schema['index_item_iter_keys']:
        term = _marshal_term(item.get(field_name), schema.get('pre_func', {}).get(field_name))
        document.add_term(prefix + term)
    # optional values are handled in pre_func
    elif field_name in schema['index_value_iter_keys']:
        value = _marshal_value(item.get(field_name), schema.get('pre_func', {}).get(field_name))
        document.add_value(field['column'], value)
    elif field_name == 'text':
        text = item['text'].encode('utf-8')
        tokens = cut(s, text)
        termgen.set_document(document)
        termgen.index_text_without_positions(' '.join(tokens), 1, prefix)
def put_on(img, el, condition, bg_color=1):
    # Center of the structural element
    x_c = el.size[0] // 2
    y_c = el.size[1] // 2
    # Additional emptiness to add to img along x and y
    dx_min = x_c
    dx_max = el.size[0] - x_c - 1
    dy_min = y_c
    dy_max = el.size[1] - y_c - 1
    element_pixels = Image2ll_binary(el)
    image_pixels = expand(Image2ll_binary(img), dx_min, dx_max, dy_min, dy_max, bg_color)
    new_image_pixels = [[1 for idx_y in range(img.size[1] + dy_min + dy_max)]
                        for idx_x in range(img.size[0] + dx_min + dx_max)]
    black_count = 0
    for idx_x in range(dx_min, dx_min + img.size[0]):
        for idx_y in range(dy_min, dy_min + img.size[1]):
            if condition == 'any' and image_pixels[idx_x][idx_y] == 0:
                for dx in range(-dx_min, dx_max + 1):
                    for dy in range(-dy_min, dy_max + 1):
                        if element_pixels[x_c + dx][y_c + dy] == 0:
                            if new_image_pixels[idx_x + dx][idx_y + dy] != 0:
                                black_count += 1
                            new_image_pixels[idx_x + dx][idx_y + dy] = 0
            elif condition == 'all':
                if image_pixels[idx_x][idx_y] == 0:
                    all_match = True
                    for dx in range(-dx_min, dx_max + 1):
                        for dy in range(-dy_min, dy_max + 1):
                            if (image_pixels[idx_x + dx][idx_y + dy] != 0
                                    and element_pixels[x_c + dx][y_c + dy] == 0):
                                all_match = False
                                break
                        if not all_match:
                            break
                    if all_match:
                        new_image_pixels[idx_x][idx_y] = 0
                        black_count += 1
    print('black:', black_count, 'white:', img.size[0] * img.size[1] - black_count)
    return ll2Image_binary(cut(new_image_pixels, dx_min, dx_max, dy_min, dy_max))
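# For intuition: with black pixels encoded as 0, the 'any' branch of put_on is
# binary dilation and the 'all' branch is binary erosion. A minimal SciPy
# equivalent on boolean arrays (True = black), as a sketch only:
import numpy as np
from scipy import ndimage

img = np.zeros((5, 5), dtype=bool)
img[2, 2] = True                     # a single black pixel
el = np.ones((3, 3), dtype=bool)     # structuring element
print(ndimage.binary_dilation(img, structure=el).sum())  # 9, like condition='any'
print(ndimage.binary_erosion(img, structure=el).sum())   # 0, like condition='all'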
def embed_watermark(host, watermark, robustness):
    old_shape = host.shape[0:2]
    host = utils.pad(host)
    height, width = host.shape[0:2]
    host = cv.cvtColor(host, cv.COLOR_BGR2YCrCb)
    luma = host[:, :, 0]
    luma = luma.astype(np.float64)  # np.float is removed in recent NumPy
    watermark = cv.resize(watermark, (width >> 1, height >> 2))
    watermark = cv.adaptiveThreshold(watermark, 255, cv.ADAPTIVE_THRESH_MEAN_C,
                                     cv.THRESH_BINARY, 11, 2)
    watermark, pxl_perm_mat = utils.pseudo_random_permute(watermark)
    luma = utils.dct(luma)
    luma = embed_freq_domain(luma, watermark, robustness)
    luma = utils.idct(luma)
    host[:, :, 0] = np.clip(luma, 0, 255)
    host = cv.cvtColor(host, cv.COLOR_YCrCb2BGR)
    host = utils.cut(host, old_shape)
    return host, pxl_perm_mat
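# Hypothetical usage sketch for embed_watermark above; "host.png" and "logo.png"
# are made-up file names, and the robustness value is only an illustration
# passed through to embed_freq_domain.
import cv2 as cv

host = cv.imread("host.png")                            # BGR host image
mark = cv.imread("logo.png", cv.IMREAD_GRAYSCALE)       # grayscale watermark
marked, pxl_perm_mat = embed_watermark(host, mark, robustness=8)
cv.imwrite("marked.png", marked)  # keep pxl_perm_mat to extract the mark later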
def load_dataset(test_size=0.2):
    ## load word vector
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(name='%s/word2vec/w2v_sgns_%s_%s_%s.npy' % (
            config.ModelOutputDir, config.embedding_size, config.corpus_version, datestr))
    ## load train data
    with utils.timer('Load train data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file_1)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data_4 = utils.load_cs_deleted_data(cs_delete_file_2)
        print(data_4['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3,
                          data_4[data_4['label'] == 1].reset_index(drop=True)],
                         axis=0, ignore_index=True)
        # data = pd.concat([data_1, data_2, data_3, data_4], axis=0, ignore_index=True)
    DebugDir = '%s/debug' % config.DataBaseDir
    if not os.path.exists(DebugDir):
        os.makedirs(DebugDir)
    del data_4, data_3, data_2, data_1
    gc.collect()
    ## data representation
    with utils.timer('representation for train'):
        # X = [[word2vec.get(w, word2vec['_UNK']) for w in utils.cut(text)] for text in data['text'].values]
        X = []
        y = []
        for i in range(len(data)):
            text = data['text'][i]
            if text == '':
                continue
            words = utils.cut(text)
            if len(words) == 0:
                continue
            X.append([word2vec.get(w, word2vec['_UNK']) for w in words])
            y.append(data['label'][i])
    del word2vec, data
    gc.collect()
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size)
    return X_train, y_train, X_valid, y_valid
def WordRepresentation(texts, word2vec):
    with utils.timer('representation'):
        ## padding
        X = []
        indexes = []
        for i in range(len(texts)):
            text = texts[i]
            words = utils.cut(text)
            if len(words) == 0:
                continue
            if len(words) < maxlen:
                X.append(['_UNK'] * (maxlen - len(words)) + words)  # ahead padding in default mode
            else:
                X.append(words[:maxlen])
            indexes.append(i)
        ## word2vec
        X = np.array([[word2vec.get(w, word2vec['_UNK']) for w in wv] for wv in X])
    return X, indexes
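# Hypothetical usage sketch for WordRepresentation above; word2vec is assumed
# to map tokens to fixed-size vectors with an '_UNK' fallback, and maxlen is a
# module-level constant in the surrounding code.
texts = ['first example text', 'second example']
X, indexes = WordRepresentation(texts, word2vec)
print(X.shape)   # (len(indexes), maxlen, embedding_size)
print(indexes)   # positions of the texts that produced at least one token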
def test():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : " + "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    data = [((f[0], f[1]), float(f[2]))
            for f in [line.strip().split("|||") for line in open(sys.argv[1])]]
    print "sample data:", data[:3]
    train_data, devel_data, test_data = cut(data)
    logging.info('loading model...')
    glove_embedding = GloveEmbedding(sys.argv[2])
    logging.info('done!')
    dim = int(sys.argv[3])
    X_train = featurize(train_data, glove_embedding, dim)
    Y_train = np.array([e[1] for e in train_data])
    logging.info("Input shape: {0}".format(X_train.shape))
    print X_train[:3]
    logging.info("Label shape: {0}".format(Y_train.shape))
    print Y_train[:3]
    input_dim = X_train.shape[1]
    output_dim = 1
    model = create_model(input_dim, output_dim)
    model.fit(X_train, Y_train, nb_epoch=int(sys.argv[4]), batch_size=32)
    X_devel = featurize(devel_data, glove_embedding, dim)
    Y_devel = np.array([e[1] for e in devel_data])
    pred = model.predict_proba(X_devel, batch_size=32)
    corr = spearmanr(pred, Y_devel)
    print "Spearman's R: {0}".format(corr)
def _index_field(field, document, item, schema_version, schema):
    prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
    if schema_version == 2:
        # optional term, stored as '0' when missing
        if field['field_name'] in ['retweeted_status']:
            term = _marshal_term(item[field['field_name']],
                                 schema['pre'][field['field_name']]) if field['field_name'] in item else '0'
            document.add_term(prefix + term)
        # required term
        elif field['field_name'] in ['user']:
            term = _marshal_term(item[field['field_name']], schema['pre'][field['field_name']])
            document.add_term(prefix + term)
        # value
        elif field['field_name'] in ['_id', 'timestamp', 'reposts_count', 'comments_count', 'attitudes_count']:
            document.add_value(field['column'], _marshal_value(item[field['field_name']]))
        elif field['field_name'] == 'text':
            tokens = cut(s, item[field['field_name']].encode('utf-8'))
            termgen = xapian.TermGenerator()
            termgen.set_document(document)
            termgen.index_text_without_positions(' '.join(tokens), 1, prefix)
            """
            for token, count in Counter(tokens).iteritems():
                document.add_term(prefix + token, count)
            """
    elif schema_version == 1:
        # required term
        if field['field_name'] in ['name', 'location', 'province']:
            term = _marshal_term(item[field['field_name']])
            document.add_term(prefix + term)
        # optional value, stored as '0' when missing
        elif field['field_name'] in ['created_at']:
            value = _marshal_value(item[field['field_name']],
                                   schema['pre'][field['field_name']]) if field['field_name'] in item else '0'
            document.add_value(field['column'], value)
        # required value
        elif field['field_name'] in ['_id', 'followers_count', 'statuses_count', 'friends_count', 'bi_followers_count']:
            document.add_value(field['column'], _marshal_value(item[field['field_name']]))
def detect_fissures(img):
    pimg = np.array(img)
    limiar = utils.define_threshold(img)
    if limiar > 0 and limiar < 0.05:
        offset = 10
    elif limiar > 0.05:
        offset = 25
    pimg = utils.define_skeleton(img, 45, offset)
    if limiar > 0.05:
        pimg = utils.filtragem(pimg)
    runs = 30
    lines = []
    for _ in range(runs):
        lines.extend(transform.probabilistic_hough_line(pimg))
    directions = np.array([utils.angle(point1, point2) for point1, point2 in lines])
    directions = np.where(directions < 0, directions + 180, directions)
    hist = np.histogram(directions, range=[0, 180], bins=180)
    sort_indexes = np.argsort(hist[0])
    hist = hist[0][sort_indexes], hist[1][sort_indexes]
    a1, a2 = hist[1][-2:]
    rot_pimg1 = skimage.transform.rotate(pimg, a1 - 90, resize=True)
    rot_pimg2 = skimage.transform.rotate(pimg, a2 - 90, resize=True)
    width = 3
    kernel1 = np.pad(np.ones([rot_pimg1.shape[0], width]), 1, mode='constant', constant_values=-1)
    kernel2 = np.pad(np.ones([rot_pimg2.shape[0], width]), 1, mode='constant', constant_values=-1)
    corr1 = sp.ndimage.correlate(rot_pimg1, kernel1, mode='constant')
    corr2 = sp.ndimage.correlate(rot_pimg2, kernel2, mode='constant')
    corr_rot1 = utils.cut(skimage.transform.rotate(corr1, 90 - a1, resize=True), pimg.shape)
    corr_rot2 = utils.cut(skimage.transform.rotate(corr2, 90 - a2, resize=True), pimg.shape)
    thresholds = [
        filters.threshold_isodata, filters.threshold_li, filters.threshold_mean,
        filters.threshold_minimum, filters.threshold_otsu,
        filters.threshold_triangle, filters.threshold_yen
    ]
    best_fitness = -np.inf
    best_mask = None
    for thr in thresholds:
        binary1 = np.where(corr_rot1 > thr(corr1), 1, 0)
        binary2 = np.where(corr_rot2 > thr(corr2), 1, 0)
        selem = np.ones((5, 5))
        binary_dilated1 = skimage.morphology.dilation(binary1, selem=selem)
        binary_dilated2 = skimage.morphology.dilation(binary2, selem=selem)
        mask = np.logical_or(binary_dilated1, binary_dilated2)
        fitness_value = fitness(mask, pimg)
        if fitness_value > best_fitness:
            best_fitness = fitness_value
            best_mask = mask
    mask = best_mask
    rachaduras = (1 - mask) * pimg
    # rachaduras_rgba = np.where(rachaduras[..., np.newaxis] == 1, [255,0,0,255], [0,0,0,0])
    return rachaduras
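# Hypothetical usage sketch for detect_fissures above; "wall.png" is a made-up
# file name for a grayscale photograph of a cracked surface.
import numpy as np
import skimage.io

img = skimage.io.imread("wall.png", as_gray=True)
rachaduras = detect_fissures(img)
skimage.io.imsave("cracks.png", (rachaduras * 255).astype(np.uint8))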
def triple_classifier(tweet):
    '''
    Output labels:
    0 neutral, 1 positive, 2 angry, 3 anxious, 4 sad, 5 disgusted, 6 other negative
    '''
    sentiment = 0
    # text = tweet['text']  # encode
    text = tweet['text_ch']
    keywords_list = []
    try:
        emoticon_sentiment = emoticon(text)
        if emoticon_sentiment != 0:
            entries = cut(cut_str, text.encode('utf-8'))
            entry = [e.decode('utf-8', 'ignore') for e in entries]
            keywords_list = entry
            if emoticon_sentiment == HAPPY:
                sentiment = emoticon_sentiment
                text = u''
            else:
                sentiment = flow_psychology_classfiy(text)
                if sentiment == 0:
                    sentiment = emoticon_sentiment
                text = u''
        if text != u'':
            entries = cut(cut_str, text.encode('utf-8'))
            entry = [e.decode('utf-8', 'ignore') for e in entries]
            keywords_list = entry
            bow = dictionary_1.doc2bow(entry)
            s = [1, 1]
            for pair in bow:
                s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
                s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
            if s[0] <= s[1]:
                bow = dictionary_2.doc2bow(entry)
                s = [1, 1, 1]
                for pair in bow:
                    s[0] = s[0] * (step2_score[pair[0]][0] ** pair[1])
                    s[1] = s[1] * (step2_score[pair[0]][1] ** pair[1])
                    s[2] = s[2] * (step2_score[pair[0]][2] ** pair[1])
                if s[0] > s[1] and s[0] > s[2]:
                    sentiment = HAPPY
                else:
                    sentiment = flow_psychology_classfiy(text)
                    if sentiment == 0:
                        if s[1] > s[0] and s[1] > s[2]:
                            sentiment = SAD
                        elif s[2] > s[1] and s[2] > s[0]:
                            sentiment = ANGRY
                        else:
                            sentiment = 6
            else:
                sentiment = 0
    except:
        pass
    return sentiment, keywords_list
def vlans(self):
    return [int(x) for x in utils.cut(utils.loads(self.vlan_list()))]
def images(self):
    return utils.cut(utils.loads(self.image_list()))
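# Hypothetical usage sketch for the two methods above; `switch` stands in for
# whatever object exposes vlan_list() and image_list() in this class, and the
# calls assume they are plain methods (any decorator is not shown here).
print(switch.vlans())   # e.g. [1, 10, 20] after utils.loads/utils.cut parsing
print(switch.images())  # parsed list of image names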
all_users = []
all_uids = []
all_groups = []
all_gids = []
for user in passwd_file:
    all_users.append(user[0])
    all_uids.append(int(user[1]))
for group in group_file:
    all_groups.append(group[0])
    all_gids.append(int(group[1]))

defaults = cat('defaults')
home = cut(grep(defaults, 'HOME'))
shell = cut(grep(defaults, 'SHELL'))
group = cut(grep(defaults, 'GROUP'))
skel = cut(grep(defaults, 'SKEL'))
del defaults

login_defs = cat('login.defs')
pass_max_days = cut(grep(login_defs, 'PASS_MAX_DAYS'))
pass_min_days = cut(grep(login_defs, 'PASS_MIN_DAYS'))
pass_warn_age = cut(grep(login_defs, 'PASS_WARN_AGE'))
uid_min = int(cut(grep(login_defs, 'UID_MIN')))
uid_max = int(cut(grep(login_defs, 'UID_MAX')))
gid_min = int(cut(grep(login_defs, 'GID_MIN')))
gid_max = int(cut(grep(login_defs, 'GID_MAX')))
is_home_needed = cut(grep(login_defs, 'CREATE_HOME'))
is_usegroups_enabled = cut(grep(login_defs, 'USERGROUPS_ENAB'))
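# The cat/grep/cut helpers above come from this repo's utils; minimal stand-ins
# matching only how they are used here, under the assumption that the config
# files hold KEY=VALUE or KEY VALUE lines (the real helpers may differ):
def cat(name):
    with open(name) as fh:
        return [line.strip() for line in fh if line.strip()]

def grep(lines, key):
    return [line for line in lines if key in line]

def cut(lines):
    # take the last '='- or whitespace-separated field of the first match
    return lines[0].replace('=', ' ').split()[-1]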
def test_scws(self):
    sentence = u'中国好声音'
    s = load_scws()
    tokens = cut(s, sentence.encode('utf-8'))
    self.assertNotEqual(tokens, None, 'scws failed')
if limiar > 0.05:
    width = 3
    # kernel1 = np.pad(np.ones([rot_pimg1.shape[0], width]), 1, mode='constant', constant_values=0)
    # kernel2 = np.pad(np.ones([rot_pimg2.shape[0], width]), 1, mode='constant', constant_values=0)
    kernel1 = np.ones([rot_pimg1.shape[0], width])
    kernel2 = np.ones([rot_pimg2.shape[0], width])
    corr1 = ndimage.correlate(rot_pimg1, kernel1, mode='constant')
    corr2 = ndimage.correlate(rot_pimg2, kernel2, mode='constant')

    print("Filter applied")
    plot_img_grid([[corr1, corr2]])
    plt.show()

    print("Filtered image rotated back to its original position")
    corr_rot1 = utils.cut(transform.rotate(corr1, 90 - a1, resize=True), pimg.shape)
    corr_rot2 = utils.cut(transform.rotate(corr2, 90 - a2, resize=True), pimg.shape)
    plot_img_grid([[corr_rot1, corr_rot2]])
    plt.show()

    binary1 = np.where(corr_rot1 > filters.threshold_yen(corr1), 1, 0)
    binary2 = np.where(corr_rot2 > filters.threshold_yen(corr2), 1, 0)
    print('Binarized filtered image')
    plot_img_grid([[binary1, binary2]])
    plt.show()

    print('Dilated binarized filtered image')
    selem = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]])
    binary_dilated1 = morphology.dilation(binary1, selem=selem)
    binary_dilated2 = morphology.dilation(binary2, selem=selem)
    plot_img_grid([[binary_dilated1, binary_dilated2]])
def triple_classifier(tweet):
    """
    Output labels:
    0 neutral, 1 positive, 2 angry, 3 anxious, 4 sad, 5 disgusted, 6 other negative
    """
    sentiment = 0
    text = tweet["text"]  # encode
    keywords_list = []
    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        entries = cut(cut_str, text)
        entry = [e.decode("utf-8", "ignore") for e in entries]
        keywords_list = entry
        if emoticon_sentiment == HAPPY:
            sentiment = emoticon_sentiment
            text = ""
        else:
            sentiment = flow_psychology_classfiy(text)
            if sentiment == 0:
                sentiment = emoticon_sentiment
            text = ""
    if text != "":
        entries = cut(cut_str, text)
        entry = [e.decode("utf-8", "ignore") for e in entries]
        keywords_list = entry
        bow = dictionary_1.doc2bow(entry)
        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] <= s[1]:
            bow = dictionary_2.doc2bow(entry)
            s = [1, 1, 1]
            for pair in bow:
                s[0] = s[0] * (step2_score[pair[0]][0] ** pair[1])
                s[1] = s[1] * (step2_score[pair[0]][1] ** pair[1])
                s[2] = s[2] * (step2_score[pair[0]][2] ** pair[1])
            if s[0] > s[1] and s[0] > s[2]:
                sentiment = HAPPY
            else:
                sentiment = flow_psychology_classfiy(text)
                if sentiment == 0:
                    if s[1] > s[0] and s[1] > s[2]:
                        sentiment = SAD
                    elif s[2] > s[1] and s[2] > s[0]:
                        sentiment = ANGRY
                    else:
                        sentiment = 6
        else:
            sentiment = 0
    return sentiment
with open(stop_file, "r") as f:
    stop_list = f.readlines()
stop_list = list(map(lambda x: x.rstrip("\n"), stop_list))

with open("./data/adult_content.txt", "r") as f:
    lines = f.readlines()
lines = list(map(lambda x: x.rstrip("\n"), lines))
lines_n = []
for line in lines:
    line = line.split("\t")
    line = " ".join(line[-4:])
    # line = "".join(line.split())
    line = " ".join(line.split())
    line = " ".join(list(cut([line], use_stop=use_stop, stop_list=stop_list)))
    line += "\t1"
    lines_n.append(line)

with open("./data/normal_content.txt", "r") as f:
    lines_p = f.readlines()
lines_p = list(map(lambda x: x.rstrip("\n"), lines_p))
lines_p = list(map(lambda x: " ".join(x.split()), lines_p))
lines_p = cut(lines_p, use_stop=use_stop, stop_list=stop_list)
lines_p = list(map(lambda x: x + "\t0", lines_p))

lines_all = lines_n + lines_p
random.shuffle(lines_all)
with open("data/data.txt", "w") as f:
    f.write("\n".join(lines_all))  # assumed write of the shuffled labeled corpus