import os
import pickle

from library import clean_text_simple


def Preprocessing(mailID_content, preprocessing_file):
    """Clean each e-mail body and cache the result on disk as a pickle."""
    if os.path.exists(os.path.abspath(preprocessing_file)):
        print('The text has already been processed and stored in %s' % os.path.abspath(preprocessing_file))
        with open(os.path.abspath(preprocessing_file), 'rb') as f:
            content = pickle.load(f)
        return content
    else:
        print('You chose a new processing method; the cleaned text will be stored in %s' % os.path.abspath(preprocessing_file))
        # list() is needed in Python 3, where dict views are not indexable
        keys, values = list(mailID_content.keys()), list(mailID_content.values())
        for i, content in enumerate(values):
            content = clean_text_simple(content)
            # drop very short tokens
            content = [word for word in content if len(word) > 2]
            values[i] = ' '.join(content)
            if i % 10000 == 0:
                print('processed %d emails already...' % i)
        content = dict(zip(keys, values))
        with open(os.path.abspath(preprocessing_file), 'wb') as f:
            pickle.dump(content, f)
        return content
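# For reference, a minimal usage sketch of Preprocessing; the e-mail dict and
# cache filename below are hypothetical, and clean_text_simple is assumed to
# return a list of tokens as the function above expects.

emails = {
    'id_001': 'Please review the attached quarterly report before Friday.',
    'id_002': 'Meeting moved to 3pm, see the updated agenda.',
}

# The first call cleans the text and writes cleaned_emails.pkl; any later call
# with the same file simply reloads the pickle instead of re-processing.
cleaned = Preprocessing(emails, 'cleaned_emails.pkl')
print(cleaned['id_001'])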
abstracts = []
for counter, filename in enumerate(abstract_names):
    # read file
    with open(path_to_abstracts + '/' + filename, 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    # remove formatting
    text = re.sub(r'\s+', ' ', text)
    abstracts.append(text)
    if counter % round(len(abstract_names) / 5) == 0:
        print(counter, 'files processed')
print('')

abstracts_cleaned = []
for counter, abstract in enumerate(abstracts):
    my_tokens = clean_text_simple(abstract, my_stopwords=stpwds, punct=punct)
    abstracts_cleaned.append(my_tokens)
    if counter % round(len(abstracts) / 5) == 0:
        print(counter, 'abstracts processed')
print('')

###############################################
# read and pre-process gold standard keywords #
###############################################

path_to_keywords = "data/Hulth2003testing/uncontr/"
keywd_names = sorted(os.listdir(path_to_keywords))

keywds_gold_standard = []
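# The snippet above stops right after initializing keywds_gold_standard. A
# plausible continuation, assuming (as is the Hulth2003 convention) that each
# .uncontr file lists the gold-standard keyphrases separated by semicolons:

for counter, filename in enumerate(keywd_names):
    with open(path_to_keywords + '/' + filename, 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    text = re.sub(r'\s+', ' ', text)
    keywds = [keywd.strip() for keywd in text.split(';')]
    keywds_gold_standard.append(keywds)
    if counter % round(len(keywd_names) / 5) == 0:
        print(counter, 'keyword files processed')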
import string

from nltk.corpus import stopwords

from library import clean_text_simple, terms_to_graph, unweighted_k_core

stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = '''A method for solution of systems of linear algebraic equations
with m-dimensional lambda matrices. A system of linear algebraic
equations with m-dimensional lambda matrices is considered. The
proposed method of searching for the solution of this system lies
in reducing it to a numerical system of a special kind.'''

my_doc = my_doc.replace('\n', ' ')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

# build a graph-of-words with a sliding window of size 4
g = terms_to_graph(my_tokens, w=4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])
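# unweighted_k_core is imported above but never called. A sketch of the usual
# next step, under the assumption (common in graph-of-words implementations)
# that it takes the igraph graph and returns a dict mapping each term to its
# core number; the keywords are then the terms in the innermost core:

core_numbers = unweighted_k_core(g)  # assumed interface: {term: core number}
max_core = max(core_numbers.values())
keywords = [term for term, c in core_numbers.items() if c == max_core]
print(keywords)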
# nltk.download('maxent_treebank_pos_tagger')
# nltk.download('stopwords')

# import custom functions
from library import clean_text_simple, terms_to_graph, unweighted_k_core

my_doc = '''A method for solution of systems of linear algebraic equations
with m-dimensional lambda matrices. A system of linear algebraic
equations with m-dimensional lambda matrices is considered. The
proposed method of searching for the solution of this system lies
in reducing it to a numerical system of a special kind.'''

my_doc = my_doc.replace('\n', ' ')

# pre-process document
my_tokens = clean_text_simple(my_doc)

# build a graph-of-words with a sliding window of size 4
g = terms_to_graph(my_tokens, w=4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])
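# For readers without access to library.py: a minimal sketch of what
# terms_to_graph is assumed to do (an illustration, not the library's actual
# code). It slides a window of size w over the token sequence, links every
# pair of distinct terms co-occurring in a window, and uses the co-occurrence
# count as the edge weight.

from collections import Counter

import igraph


def terms_to_graph_sketch(terms, w):
    # count co-occurrences of distinct term pairs within each window
    cooc = Counter()
    for i in range(len(terms)):
        for j in range(i + 1, min(i + w, len(terms))):
            if terms[i] != terms[j]:
                cooc[tuple(sorted((terms[i], terms[j])))] += 1
    g = igraph.Graph()
    g.add_vertices(sorted(set(terms)))  # vertex 'name' attribute = term
    g.add_edges(list(cooc.keys()))      # endpoints given by vertex name
    g.es['weight'] = list(cooc.values())
    return g


# sanity check on a toy token list
toy = ['graph', 'of', 'words', 'graph', 'mining']
print(terms_to_graph_sketch(toy, w=3).es['weight'])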
abstracts = []
counter = 0
for filename in abstract_names:
    with open(path_to_abstracts + '/' + filename, 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    # remove formatting
    text = re.sub(r'\s+', ' ', text)
    abstracts.append(text)
    counter += 1
    if counter % 100 == 0:
        print(counter, 'files processed')

abstracts_cleaned = []
counter = 0
for abstract in abstracts:
    my_tokens = clean_text_simple(abstract)
    abstracts_cleaned.append(my_tokens)
    counter += 1
    if counter % 100 == 0:
        print(counter, 'abstracts processed')

#################################
# read and pre-process keywords #
#################################

path_to_keywords = "../datasets/Hulth2003testing/uncontr"  # fill me (absolute path)
keywords_names = sorted(os.listdir(path_to_keywords))

keywords_gold_standard = []
counter = 0
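# The script stops just before looping over keywords_names. Once extracted
# keywords and gold-standard keywords are both available, the usual final step
# of this exercise is a precision/recall comparison. A self-contained sketch
# (the helper name and the example lists below are illustrative only):

def evaluate(extracted, gold):
    # compare one document's extracted keywords against its gold standard
    tp = len(set(extracted) & set(gold))
    precision = tp / len(extracted) if extracted else 0.0
    recall = tp / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1


p, r, f1 = evaluate(['graph', 'keyword', 'core'], ['keyword', 'graph-of-words'])
print(p, r, f1)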