def main(arg):
    dir = os.path.dirname(__file__)
    filename = os.path.join(dir, 'stanford-corenlp-python/stanford-corenlp-full-2014-08-27/*')
    configFileLoc = os.path.join(dir, 'config.ini')
    proc = CoreNLP(configfile=configFileLoc, corenlp_jars=[filename])
    # strip section headings from the input file before parsing
    with open(arg, "r") as file:
        data = removeHeadings(file)
    parsed = proc.parse_doc(data)
    # rebuild each parsed sentence from its tokens, dropping punctuation
    data = []
    for s in parsed[u'sentences']:
        sent = str(' '.join(s[u'tokens']))
        data.append(sent.translate(string.maketrans("", ""), string.punctuation))
    # variant 1: sentences joined with periods, whitespace normalised
    data1 = ".".join(data)
    data1 = data1.replace("..", ".")
    data1 = data1.replace("  ", " ")
    data1 = data1.replace(" .", ". ")
    # variant 2: sentences joined with spaces only
    data2 = " ".join(data)
    data2 = data2.replace("  ", " ")
    file_train1 = open("data/a1_train1.txt", "w")
    file_train1.write(data1)
    file_train1.close()
    file_train2 = open("data/a1_train2.txt", "w")
    file_train2.write(data2)
    file_train2.close()
    file_test1 = open("data/a1_test1.txt", "w")
    file_test1.write(clean1(data1))
    file_test1.close()
    file_test2 = open("data/a1_test2.txt", "w")
    file_test2.write(clean(data2))
    file_test2.close()
def main(arg="iamtoocoolforthis"):
    s = clean(arg)
    print "CLEANED STRING:", s
    print "======================RUNNING OPTIMIZED==================="
    print segment_method1(s)
    print "======================RUNNING VANILLA==================="
    print segment(s)
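# For reference, a minimal hedged sketch of what the "vanilla" branch above does with the
# default argument, assuming clean/segment come from the wordsegment package (the snippet
# does not show its imports):
from wordsegment import load, clean, segment

load()  # recent wordsegment releases require loading the corpus data before segmenting
print(segment(clean("iamtoocoolforthis")))
# expected something like ['i', 'am', 'too', 'cool', 'for', 'this']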
def apply_preprocessing(s):
    # run the raw string through the preprocessing filters, then clean and
    # dictionary-segment each surviving token into a flat list of words
    filters = [strip_non_alphanum, strip_multiple_whitespaces, split_alphanum, remove_stopwords]
    tokens = preprocess_string(s, filters)
    result = []
    for token in tokens:
        segmented = segment(clean(token))
        for i in segmented:
            result.append(i)
    return result
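# Hedged usage sketch for apply_preprocessing: the imports below are assumptions (the
# snippet does not show them), but the filter functions do exist under these names in
# gensim.parsing.preprocessing, and clean/segment in the wordsegment package.
from gensim.parsing.preprocessing import (preprocess_string, strip_non_alphanum,
                                          strip_multiple_whitespaces, split_alphanum,
                                          remove_stopwords)
from wordsegment import load, clean, segment

load()  # load wordsegment's corpus data before segmenting
print(apply_preprocessing("thisisatest #datascience"))
# expected: a flat list of lower-cased tokens, e.g. something like
# ['this', 'is', 'a', 'test', 'data', 'science']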
def word_segment(text, limit=250):
    # segment a long unspaced string in chunks of at most `limit` characters rather than
    # all at once, freeing memory between chunks
    next_text = wordsegment.clean(text)
    word_list = []
    while len(next_text) > limit:
        current_text = next_text[:limit]
        next_text = next_text[limit:]
        word_list.extend(wordsegment.segment(current_text))
        # push the last five segmented words back onto the remaining text so a word
        # cut at the chunk boundary can be re-segmented correctly
        next_text = ''.join([word_list[i] for i in xrange(-5, 0)]) + next_text
        word_list = word_list[:-5]
        gc.collect()
    word_list.extend(wordsegment.segment(next_text))
    text = ' '.join(w for w in word_list)
    return text
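# Hedged usage sketch for word_segment above. Assumes Python 2 (xrange in the loop) with
# `import wordsegment` and `import gc` at module level, which the snippet does not show.
import gc
import wordsegment
# wordsegment.load()  # uncomment on newer releases that require explicit data loading

long_text = "thequickbrownfoxjumpsoverthelazydog" * 20  # ~700 characters, no spaces
print(word_segment(long_text, limit=100))  # prints the space-separated segmentation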
def main():
    # list for sentences
    sentences = []
    file_name = "mergefile.txt"
    file = open(file_name, "r", encoding="utf-8")
    data = file.readlines()
    for i in data:
        i = i.strip('\n').replace('["', "").replace('"]', "")
        sentence = i.split('","')[1]  # delete one record in data "485501831406026752"
        words = sentence.split(" ")
        # add list of words to sentence list
        sentences.append(words)
    # load word segment data
    load()
    # for storing cleaned and segmented/spaced out words (for hashtag separation)
    cleanedWords = []
    # for storing resultant sentences
    cleanedSentences = []
    for sentence in sentences:
        for word in sentence:
            # finds hashtags by using # and wildcard along with the fnmatch module
            filtered = fnmatch.filter(word, '#*')
            # if there is no hashtag in the current word, append a cleaned version of it,
            # which removes some punctuation, lower-cases, and otherwise preprocesses
            if not filtered:
                cleanedWords.append(clean(word))
            # otherwise use segment to try to break the hashtag up into distinct words
            else:
                cleanedWords.extend(segment(word))
        # add the resulting list of words to the cleaned sentence structure and reset the word list
        cleanedSentences.append(cleanedWords)
        cleanedWords = []
    # if you wish to output them to a csv instead
    with open("out.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(cleanedSentences)
def test_clean():
    assert clean("Can't buy me love!") == 'cantbuymelove'
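# A couple of additional hedged checks in the same spirit, assuming clean is
# wordsegment.clean, which lower-cases its input and drops every character that is not a
# lower-case letter or digit (including spaces, hyphens, and apostrophes):
def test_clean_more():
    assert clean("Hello, World!") == 'helloworld'
    assert clean("e-mail 2 me") == 'email2me'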
def segment(self, word):
    # clean the word, then delegate to the module-level segment function
    # (the method name does not shadow it inside the method body)
    cleaned = clean(word)
    segmented = segment(cleaned)
    return segmented