Example #1
def extract_features(parsed_file, unified_file, cube_file, cube_context_file, freq_data_file, stop_word_file, freq_pattern_file, base_dir, total_docs, filtered_cell_str):
  print(filtered_cell_str)
  freq_patterns = load_freq_patterns(freq_pattern_file)
  freq_data = load_freq(freq_data_file)
  cubes = load_cube(cube_file)
  contexts = load_context(cube_context_file)
  unified_list = load_unified_list(unified_file)
  print(contexts.keys())
  #cubes['all'] = [i for i in range(total_docs)]
  all_docs = list(range(total_docs))
  total_cnt = agg_phrase_cnt(freq_data, all_docs)

  print(sum(total_cnt.values()))
 
  # extract the features of phrases in each cube cell
  phrase_feature_all = {}
  for att in cubes:
    if att != filtered_cell_str:  # e.g. 'Topic|Sports;Location|Illinois;'
      continue
    print "start processing " + att
    selected_doc = cubes[att]
    selected_context = contexts[att]
    #print selected_context
    feature_extractor = FeatureExtractor(parsed_file, selected_doc, selected_context, freq_data, stop_word_file, freq_patterns, total_cnt, unified_list)
    phrase_features = feature_extractor.extract_features()
    for phrase in phrase_features:
      norm_phrase = normalize(phrase)
      phrase_features[phrase].append(unified_list[norm_phrase])
      # key each feature vector by the flattened, lowercased cell name plus the phrase
      cell_phrase = "{0}{1}".format(att.replace('|', '_').replace(';', '_').replace(' ', '_').lower(), norm_phrase)
      phrase_feature_all[cell_phrase] = phrase_features[phrase]

  file_name = "{0}/{1}".format(base_dir, "cells.fea")  # "cells.fea", not "cells.fea.fea"
  save_features(file_name, phrase_feature_all)
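For reference, every example here assumes `load_cube` returns a dict mapping each cell string (e.g. 'Topic|Sports;Location|Illinois;') to the list of document IDs in that cell. A minimal sketch of such a loader, assuming a tab-separated on-disk format that is not confirmed by the TextExp source:

# Hypothetical sketch: the cube file format (cell_str<TAB>space-separated
# doc IDs per line) is an assumption, not the actual TextExp format.
def load_cube_sketch(cube_file):
    cubes = {}
    with open(cube_file) as f:
        for line in f:
            cell_str, ids = line.rstrip('\n').split('\t')
            cubes[cell_str] = [int(i) for i in ids.split()]
    return cubes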
Example #2
def extract_features(parsed_file, cube_file, freq_data_file, stop_word_file, freq_pattern_file, base_dir, total_docs):
  freq_patterns = load_freq_patterns(freq_pattern_file)
  freq_data = load_freq(freq_data_file)
  cubes = load_cube(cube_file)
  cubes['all'] = list(range(total_docs))  # add a cell covering every document

  # extract the features of phrases in each cube cell
  for att in cubes:
    selected_doc = cubes[att]
    feature_extractor = FeatureExtractor(parsed_file, selected_doc, freq_data, stop_word_file, freq_patterns)
    phrase_features = feature_extractor.extract_features()

    file_name = "{0}/{1}.fea".format(base_dir, att.replace('|', '_').replace(';', '_').replace(' ', '_'))
    save_features(file_name, phrase_features)
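As a usage note, the replace chain above flattens a cell string into a per-cell file name; with a hypothetical cell string (not taken from the source data):

# Hypothetical cell string, shown only to illustrate the file-name mapping.
att = 'Topic|Sports;Location|Illinois;'
print(att.replace('|', '_').replace(';', '_').replace(' ', '_'))
# -> Topic_Sports_Location_Illinois_  (saved as <base_dir>/Topic_Sports_Location_Illinois_.fea)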
Example #3
def concat_docs(cell_file, doc_file, output_file, cell_str):
	cells = load_cube(cell_file)

	with open(doc_file, 'r') as f:  # avoid shadowing the doc_file argument
		docs = f.readlines()

	for att in cells:
		if att != cell_str:
			continue
		selected_docs = cells[att]

		with open(output_file, 'w+') as g:
			for doc_id in selected_docs:
				g.write(docs[doc_id])
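A hypothetical invocation, with placeholder paths and cell string (none of these names appear in the source):

# Placeholder arguments; concat_docs copies every line of doc_file whose
# index appears in the matching cell, in doc-id order, into output_file.
concat_docs('cube_cells.txt', 'all_docs.txt', 'sports_illinois.txt',
            'Topic|Sports;Location|Illinois;')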
Example #4
File: mcx.py  Project: fangbo-tao/TextExp
import operator  # required by the sorted(...) key below

def rank_phrase(unified_file, forward_map_file, cell_file, output_file, n_docs, filtered_cell_str):
	phrase_dict = parse_unified_file(unified_file)
	forward_map = parse_forward_map(forward_map_file)

	cells = load_cube(cell_file)

	for att in cells:
		if att != filtered_cell_str:  # e.g. 'Topic|Sports;Location|Illinois;'
			continue
		print("start processing " + att)
		# get all phrases in the selected documents
		selected_docs = cells[att]
		dynamic_phrases = merge_forward_list(phrase_dict, forward_map, selected_docs)
		#context_score_vldb(dynamic_phrases, phrase_dict, threshold=5)
		context_score_vldb(dynamic_phrases, phrase_dict)
		sorted_phrase = sorted(dynamic_phrases.items(), key=operator.itemgetter(1), reverse=True)
		with open(output_file, 'w+') as g:
			for pair in sorted_phrase:
				g.write(pair[0] + '\t' + str(pair[1]) + '\n')
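The writer loop above emits one phrase<TAB>score line per phrase, highest score first. A small companion reader, included here only to document that format (it is not part of mcx.py):

# Parses the ranking written by rank_phrase: phrase<TAB>score, best first.
def load_ranking(output_file):
    with open(output_file) as f:
        return [(phrase, float(score)) for phrase, score in
                (line.rstrip('\n').split('\t') for line in f)]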