def keyword_statistics(relationship_file, subtitle_file):
    relation_list = csv_io.read_csv(relationship_file)
    subtitle = read_subtitle_file(subtitle_file)
    # Build one regular expression per relation keyword: match it at the start of a
    # line or after whitespace, but skip possessive contexts such as "her ..."/"my ...".
    relation_patterns = {}
    for relation in relation_list:
        relation_patterns[relation] = '(^[->]*' + relation.lower() + '[,|\.|\?|\!].*)' + '|' + \
            '(?<!(her|his|our|eir|our|\smy|.\sa))\s+' + relation.lower() + '[\.|,|\?|!|>]'
    subtitle_interval = []
    time_to_keyword = []
    keyword_list = []
    for line in subtitle:
        if line.strip():
            subtitle_interval.append(line)
            # The first line of a subtitle block is its index, the second its time range;
            # only the remaining text lines are matched against the patterns.
            if len(subtitle_interval) < 2:
                continue
            if len(subtitle_interval) == 2:
                subtitle_time = line[:-2]
                continue
            time_to_keyword, keyword_list = keyword_matching(relation_patterns, line, subtitle_time,
                                                             time_to_keyword, keyword_list)
        else:
            # A blank line ends the current subtitle block.
            subtitle_interval = []
    frame_to_keyword = to_frame_keyword(time_to_keyword)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'statistics_result.csv', frame_to_keyword)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'keyword_list.csv', [keyword_list])
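# A minimal, hypothetical sketch (not part of the original module) of what the relation
# pattern built above matches; 'father' stands in for one entry of relation_list.
def _demo_relation_pattern():
    import re
    pattern = '(^[->]*father[,|\.|\?|\!].*)' + '|' + \
        '(?<!(her|his|our|eir|our|\smy|.\sa))\s+father[\.|,|\?|!|>]'
    assert re.search(pattern, 'father, i know.')         # relation at the start of a line
    assert re.search(pattern, 'is he the father?')       # plain mention followed by punctuation
    assert not re.search(pattern, "that's her father.")  # possessive context is rejected by the lookbehind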
import sys

from my_class.Document import Document
from my_class.Tokenizer import Tokenizer  # assumed module path, mirroring my_class.Document
from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'
    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # tokenize
        normalize_tokens = []
        for line in doc_obj.get_lines():
            tokens = tokenizer.to_tokens(line.decode('utf-8'))
            for token in tokens:
                if tokenizer.is_stop_word(token):
                    # Stop words are dropped from the output.
                    token = ""
                elif token.isdigit():
                    # Digits are kept as-is, without stemming.
                    normalize_tokens.append(token.encode('utf-8'))
                else:
                    token = tokenizer.stemming(token)
                    normalize_tokens.append(token.encode('utf-8'))
        csv_io.write_csv('output/en_tokens/' + doc, [normalize_tokens])
        del doc_obj
        doc_id += 1
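# A minimal stand-in sketch of the Tokenizer interface the script above relies on
# (to_tokens / is_stop_word / stemming). The project's real my_class.Tokenizer is not
# shown here and may work differently; this version assumes NLTK (with its 'stopwords'
# corpus downloaded) purely for illustration.
class TokenizerSketch(object):
    def __init__(self):
        from nltk.corpus import stopwords
        from nltk.stem import PorterStemmer
        self._stop_words = set(stopwords.words('english'))
        self._stemmer = PorterStemmer()

    def to_tokens(self, line):
        # Lowercase and split on whitespace; the real tokenizer may be more elaborate.
        return line.lower().split()

    def is_stop_word(self, token):
        return token in self._stop_words

    def stemming(self, token):
        return self._stemmer.stem(token)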
def keyword_search(name_file, relationship_file, subtitle_file):
    # Read files
    name_list = csv_io.read_csv(name_file)
    relation_list = csv_io.read_csv(relationship_file)
    subtitle = read_subtitle_file(subtitle_file)
    # Create regular expression pattern for reuse
    name_patterns = {}
    for name in name_list:
        name_patterns[name] = '[\s]*' + name.lower() + "[^'\w]"
    relation_patterns = {}
    for relation in relation_list:
        relation_patterns[relation] = '[\s]*' + relation.lower() + "[^'\w]"
    # Find keyword
    time_to_keyword = []
    subtitle_interval = []
    keyword_number = 0
    keyword_list = [""]
    keyword_count = {}
    for line in subtitle:
        if line.strip():
            subtitle_interval.append(line)
            if len(subtitle_interval) < 2:
                continue
            if len(subtitle_interval) == 2:
                subtitle_time = line[:-2]
                continue
            for name in name_patterns:
                if keyword_number < MAX_KEYWORDS_IN_ONE_INTERVAL and re.search(name_patterns[name], line.lower()):
                    time_to_keyword.append([subtitle_time, name])
                    keyword_number += 1
                    if name not in keyword_list:
                        keyword_list.append(name)
                        keyword_count[name] = 1
                    else:
                        keyword_count[name] += 1
            for relation in relation_patterns:
                if keyword_number < MAX_KEYWORDS_IN_ONE_INTERVAL and re.search(relation_patterns[relation], line.lower()):
                    time_to_keyword.append([subtitle_time, relation])
                    keyword_number += 1
                    if relation not in keyword_list:
                        keyword_list.append(relation)
                        keyword_count[relation] = 1
                    else:
                        keyword_count[relation] += 1
        else:
            if keyword_number == MAX_KEYWORDS_IN_ONE_INTERVAL:
                for i in range(MAX_KEYWORDS_IN_ONE_INTERVAL):
                    time_to_keyword.pop()
            subtitle_interval = []
            keyword_number = 0
    count = Counter(values[1] for values in time_to_keyword)
    total_count = sum(keyword_count.values())
    filter_list = []
    for name, freq in count.iteritems():
        if float(freq) / total_count >= 0.012:
            print name
        else:
            filter_list.append(name)
    for name in filter_list:
        keyword_list.remove(name)
        time_to_keyword = list((values[0], values[1]) for values in time_to_keyword if values[1] != name)
    # Find the max keyword count as leading keyword
    keyword_list[0] = max(keyword_count, key=keyword_count.get)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'search_result.csv', time_to_keyword)
    csv_io.write_csv(OUTPUT_ROOT_PATH + 'keyword_list.csv', [keyword_list])
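# A minimal, hypothetical sketch (not part of the original module) of how the name and
# relation patterns built above behave; 'sherlock' stands in for one entry of name_list.
def _demo_name_pattern():
    import re
    pattern = '[\s]*' + 'sherlock' + "[^'\w]"
    assert re.search(pattern, 'i am sherlock holmes.')        # name followed by a space: matched
    assert not re.search(pattern, "that is sherlock's hat.")  # "[^'\w]" rejects the possessive form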
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import sys

from my_class.Document import Document
from doc_preprocessing import get_docs_list
from modules import csv_io


def n_gram(content, n):
    tokens = []
    for i in range(len(content) - n + 1):
        tokens.append(content[i:i + n].encode('utf-8'))
    return tokens


if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/zh_doc/'
    document_list = get_docs_list(doc_input)
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        content = doc_obj.read().decode('utf-8')
        tokens = n_gram(content, 2)
        csv_io.write_csv('output/zh_tokens/' + doc, [tokens])
        del doc_obj
        doc_id += 1
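# A minimal, hypothetical usage sketch (not part of the original script): n_gram() slides
# a window of size n over the text and returns overlapping character n-grams as UTF-8
# byte strings.
def _demo_n_gram():
    assert n_gram(u'自然語言', 2) == ['自然', '然語', '語言']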