Example #1
import os
import pickle

from library import clean_text_simple  # custom text-cleaning helper


def Preprocessing(mailID_content, preprocessing_file):
    if os.path.exists(os.path.abspath(preprocessing_file)):
        print('The text has already been processed in %s' % os.path.abspath(
            preprocessing_file))
        # reload the cached, pre-processed text
        with open(os.path.abspath(preprocessing_file), 'rb') as f:
            content = pickle.load(f)
        return content
    else:
        print('You chose a new processing method; the cleaned text will be stored in %s' % os.path.abspath(
            preprocessing_file))
        keys, values = list(mailID_content.keys()), list(mailID_content.values())
        for i, content in enumerate(values):
            # clean each email body and drop very short tokens
            content = clean_text_simple(content)
            content = [word for word in content if len(word) > 2]
            values[i] = ' '.join(content)
            if i % 10000 == 0:
                print('processed %d emails already...' % i)
        content = dict(zip(keys, values))
        # cache the cleaned text for later runs
        with open(os.path.abspath(preprocessing_file), 'wb') as f:
            pickle.dump(content, f)
    return content
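A minimal usage sketch (not part of the original example): the mail dict and the cache filename below are illustrative placeholders only.

# Illustrative input: mail IDs mapped to raw bodies (made-up data).
mailID_content = {
    'id_001': 'Meeting moved to Friday, please confirm attendance.',
    'id_002': 'Quarterly numbers attached, see the summary sheet.',
}

# First call cleans and pickles the text; later calls simply reload the cache.
cleaned = Preprocessing(mailID_content, 'mail_prep.pkl')
print(cleaned['id_001'])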
Example #2
    # read file
    with open(path_to_abstracts + '/' + filename, 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    # remove formatting
    text = re.sub(r'\s+', ' ', text)
    abstracts.append(text)

    if counter % round(len(abstract_names) / 5) == 0:
        print(counter, 'files processed')

print('')

abstracts_cleaned = []
for counter, abstract in enumerate(abstracts):
    my_tokens = clean_text_simple(abstract, my_stopwords=stpwds, punct=punct)
    abstracts_cleaned.append(my_tokens)

    if counter % round(len(abstracts) / 5) == 0:
        print(counter, 'abstracts processed')

print('')

###############################################
# read and pre-process gold standard keywords #
###############################################

path_to_keywords = "data/Hulth2003testing/uncontr/"
keywd_names = sorted(os.listdir(path_to_keywords))

keywds_gold_standard = []
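The loop that fills keywds_gold_standard is cut off above; a minimal sketch of how it might continue, assuming the Hulth2003 .uncontr files hold semicolon-separated keyphrases (an assumption, not confirmed by the snippet):

for counter, filename in enumerate(keywd_names):
    # read the uncontrolled (gold standard) keyphrases for one abstract
    with open(os.path.join(path_to_keywords, filename), 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    text = re.sub(r'\s+', ' ', text)
    # assumption: keyphrases are separated by semicolons
    keywds = [kw.strip() for kw in text.split(';') if kw.strip()]
    keywds_gold_standard.append(keywds)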
Example #3
import string

from nltk.corpus import stopwords

from library import clean_text_simple, terms_to_graph, unweighted_k_core

stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = '''A method for solution of systems of linear algebraic equations 
with m-dimensional lambda matrices. A system of linear algebraic 
equations with m-dimensional lambda matrices is considered. 
The proposed method of searching for the solution of this system 
lies in reducing it to a numerical system of a special kind.'''

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, w=4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])
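unweighted_k_core is imported above but never called in this example; a rough sketch of how it could be used, assuming it accepts the igraph graph and returns a dict mapping each term to its core number (an assumption about this custom library, not its documented interface):

# Assumption: unweighted_k_core(g) returns {term: core_number} for the custom module.
core_numbers = unweighted_k_core(g)

# keep the terms from the innermost (main) core as keyword candidates
max_core = max(core_numbers.values())
keywords = [term for term, core in core_numbers.items() if core == max_core]
print(keywords)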
Example #4
# download the required NLTK resources once, if not already installed
# nltk.download('maxent_treebank_pos_tagger')
# nltk.download('stopwords')

# import custom functions
from library import clean_text_simple, terms_to_graph, unweighted_k_core

my_doc = '''A method for solution of systems of linear algebraic equations 
with m-dimensional lambda matrices. A system of linear algebraic 
equations with m-dimensional lambda matrices is considered. 
The proposed method of searching for the solution of this system 
lies in reducing it to a numerical system of a special kind.'''

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc)

g = terms_to_graph(my_tokens, w=4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
print(len(g.vs) == len(set(my_tokens)))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])
Example #5
    with open(path_to_abstracts + '/' + filename, 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    # remove formatting
    text = re.sub(r'\s+', ' ', text)
    abstracts.append(text)

    counter += 1
    if counter % 100 == 0:
        print(counter, 'files processed')

abstracts_cleaned = []
counter = 0

for abstract in abstracts:
    my_tokens = clean_text_simple(abstract)
    abstracts_cleaned.append(my_tokens)
    counter += 1
    if counter % 100 == 0:
        print(counter, 'abstracts processed')

#################################
# read and pre-process keywords #
#################################

path_to_keywords = "../datasets/Hulth2003testing/uncontr"  # fill me (absolute path)
keywords_names = sorted(os.listdir(path_to_keywords))

keywords_gold_standard = []
counter = 0