Example #1
File: clean.py Project: nitin7/WordBreak
import os
import string

# Assumed import: the constructor call below matches brendano's
# stanford_corenlp_pywrapper package.
from stanford_corenlp_pywrapper import CoreNLP

def main(arg):
    base_dir = os.path.dirname(__file__)
    filename = os.path.join(base_dir, 'stanford-corenlp-python/stanford-corenlp-full-2014-08-27/*')
    configFileLoc = os.path.join(base_dir, 'config.ini')
    proc = CoreNLP(configfile=configFileLoc, corenlp_jars=[filename])
    with open(arg, "r") as file:
        data = removeHeadings(file)  # project-local helper defined elsewhere
        parsed = proc.parse_doc(data)
        data = []
        for s in parsed[u'sentences']:
            sent = str(' '.join(s[u'tokens']))
            # Strip all punctuation (Python 2 string.translate idiom).
            data.append(sent.translate(string.maketrans("", ""), string.punctuation))

        # Variant 1: sentences joined with periods, spacing normalized.
        data1 = ".".join(data)
        data1 = data1.replace("..", ".")
        data1 = data1.replace("  ", " ")
        data1 = data1.replace(" .", ". ")
        # Variant 2: sentences joined with single spaces.
        data2 = " ".join(data)
        data2 = data2.replace("  ", " ")
        # Write training and test variants; clean1() and clean() are
        # project-local helpers defined elsewhere.
        with open("data/a1_train1.txt", "w") as file_train1:
            file_train1.write(data1)

        with open("data/a1_train2.txt", "w") as file_train2:
            file_train2.write(data2)

        with open("data/a1_test1.txt", "w") as file_test1:
            file_test1.write(clean1(data1))

        with open("data/a1_test2.txt", "w") as file_test2:
            file_test2.write(clean(data2))
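Note that the two-argument form of translate() together with string.maketrans("", "") is Python 2 only. Under Python 3 the same punctuation strip would be written roughly as follows (a minimal sketch, not part of the original project):

import string

# Python 3 equivalent of sent.translate(string.maketrans("", ""), string.punctuation)
table = str.maketrans('', '', string.punctuation)
print('Hello, world!'.translate(table))  # Hello world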
Example #2
File: example.py Project: nitin7/WordBreak
def main(arg="iamtoocoolforthis"):
    # segment_method1() and segment() are the project's two segmentation
    # implementations being compared; clean() normalizes the input string.
    s = clean(arg)
    print "CLEANED STRING:", s
    print "======================RUNNING OPTIMIZED==================="
    print segment_method1(s)
    print "======================RUNNING VANILLA==================="
    print segment(s)
Example #3

from gensim.parsing.preprocessing import (preprocess_string, strip_non_alphanum,
                                          strip_multiple_whitespaces, split_alphanum,
                                          remove_stopwords)

def apply_preprocessing(s):
    # Run gensim's preprocessing filters over the string, then word-segment
    # each surviving token; clean() and segment() are the project's helpers
    # (or the wordsegment package's functions of the same name).
    filters = [strip_non_alphanum, strip_multiple_whitespaces, split_alphanum, remove_stopwords]
    tokens = preprocess_string(s, filters)
    result = []
    for token in tokens:
        segmented = segment(clean(token))
        result.extend(segmented)
    return result
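A minimal usage sketch for apply_preprocessing, assuming clean() and segment() behave like the wordsegment package's functions of the same name (the input string is made up for illustration):

from wordsegment import load, clean, segment

load()  # wordsegment needs its word-frequency data loaded before segment() is called
print(apply_preprocessing("Check out the #wordsegmentation demo!!"))
# e.g. ['check', 'word', 'segmentation', 'demo']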
Example #4
import gc
import wordsegment

def word_segment(text, limit=250):
    next_text = wordsegment.clean(text)
    word_list = []

    # Segment the text in chunks of `limit` characters to bound memory use.
    while len(next_text) > limit:
        current_text = next_text[:limit]
        next_text = next_text[limit:]
        word_list.extend(wordsegment.segment(current_text))
        # The chunk boundary may have split a word, so push the last five
        # segmented words back onto the remaining text and segment them again.
        # (Note: if a chunk yields five or fewer words the loop makes no
        # progress, so limit should stay much larger than a typical word.)
        next_text = ''.join(word_list[-5:]) + next_text
        word_list = word_list[:-5]
        gc.collect()

    word_list.extend(wordsegment.segment(next_text))
    return ' '.join(word_list)
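A quick usage sketch (the input string is made up; with the default limit of 250 the whole string fits in one chunk, so segment() runs once):

import wordsegment
wordsegment.load()  # recent wordsegment versions require load() before segment()

print(word_segment('thequickbrownfoxjumpsoverthelazydog'))
# e.g. 'the quick brown fox jumps over the lazy dog'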
Example #5
import csv
import fnmatch
from wordsegment import load, clean, segment  # assumed imports based on the calls below

def main():
    # list for sentences
    sentences = []
    file_name = "mergefile.txt"
    with open(file_name, "r", encoding="utf-8") as file:
        data = file.readlines()
    for i in data:
        i = i.strip('\n').replace('["', "").replace('"]', "")
        sentence = i.split('","')[1]
        # delete one record in data "485501831406026752"
        words = sentence.split(" ")
        # add list of words to sentence list
        sentences.append(words)

    # load wordsegment's word-frequency data before calling segment()
    load()
      
    # for storing cleaned and segmented/spaced-out words (for hashtag separation)
    cleanedWords = []
    # for storing resultant sentences
    cleanedSentences = []

    for sentence in sentences:
        for word in sentence:

            # fnmatch.filter iterates over the word's characters, so this
            # effectively checks whether the word contains a '#' anywhere
            filtered = fnmatch.filter(word, '#*')

            # if no hashtag in the current word, append a cleaned version of it,
            # which removes some punctuation, lower-cases, and otherwise preprocesses
            if not filtered:
                cleanedWords.append(clean(word))

            # otherwise use segment() to try to break the hashtag up into distinct words
            else:
                cleanedWords.extend(segment(word))

        # add the resultant word list to the cleaned sentence structure,
        # then reset the container for the next sentence
        cleanedSentences.append(cleanedWords)
        cleanedWords = []
    
    # if you wish to output them to a csv instead
    with open("out.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(cleanedSentences)
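For reference, the parsing loop above implies each line of mergefile.txt looks roughly like the sample below; the sample line is an assumption inferred from the replace/split calls, not taken from the original data:

line = '["485501831406026752","some tweet text with a #hashtag"]'
fields = line.strip('\n').replace('["', '').replace('"]', '').split('","')
print(fields[1])  # some tweet text with a #hashtag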
Example #6
def test_clean():
    assert clean("Can't buy me love!") == 'cantbuymelove'
Example #7
def test_clean():
    assert clean("Can't buy me love!") == 'cantbuymelove'

# Method from a class elsewhere in the project (class definition omitted);
# clean() and segment() here refer to the module-level functions.
    def segment(self, word):
        cleaned = clean(word)
        segmented = segment(cleaned)
        return segmented
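A minimal sketch of how that method might be used, assuming it lives on a small wrapper class (the Segmenter name is hypothetical) and that clean() and segment() come from the wordsegment package:

from wordsegment import load, clean, segment as ws_segment

class Segmenter:  # hypothetical wrapper; only the method logic comes from the example
    def segment(self, word):
        cleaned = clean(word)
        return ws_segment(cleaned)

load()
print(Segmenter().segment("Can'tBuyMeLove"))  # e.g. ['cant', 'buy', 'me', 'love']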