# `sentences` (list of tokenised sentences) and `sentences_string` are assumed
# to be initialised earlier in the script.
for file in files:
    file_path = dir_path + file
    file_name, file_extension = splitext(file_path)

    # Extract raw text from the supported file types, skip everything else.
    if file_extension == ".pdf":
        doc = convert_pdf_to_txt(file_path)
    elif file_extension == ".docx":
        doc = convert_docx_to_txt(file_path)
    else:
        continue

    if doc:
        # Normalise to plain ASCII, join phrases and lower-case the text.
        if isinstance(doc, bytes):
            doc = doc.decode("utf8")
        doc = doc.encode("ascii", "ignore").decode("ascii")
        doc = words_to_phrases(doc)
        doc = doc.lower()

        doc_sentence = doc_to_sentences(doc, tokenizer, remove_stopwords=True)
        sentences += doc_sentence
        print(file_path)

        # Derive the author and a date label from the underscore-delimited file name.
        file1 = file.split("_")
        month = file1[2].split(".")[0] + "-" + file1[1]
        author = file1[0]

        # Record every sentence together with its author, source file and month.
        for item in doc_sentence:
            item = " ".join(item)
            sentences_string += "\n" + item + " | " + author + " | " + file + " | " + month

print(sentences_string.count("\n"))
with open("sentences.txt", "w") as text_file:
    text_file.write("%s" % sentences_string)
doc = "" if file_extension == ".pdf": doc = convert_pdf_to_txt(file_path) elif file_extension == ".docx": doc = convert_docx_to_txt(file_path) else: continue if doc != "": doc_sentence = [] doc = doc.decode("utf8").encode('ascii', 'ignore').decode('ascii') doc = words_to_phrases(doc) doc = doc.lower() sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=True) doc_sentence = doc_to_sentences(doc, tokenizer, remove_stopwords=True) print(file_path) file1 = file.split("_") month = file1[2].split(".")[0] + "-" + file1[1] author = file1[0] for item in doc_sentence: item = " ".join(item) #sentences_string += "\n"+item+" | "+author+" | "+file.decode("utf8").encode('ascii', 'ignore').decode('ascii')+" | "+month print(sentences_string.count("\n")) with open("sentences.txt", "w") as text_file: text_file.write("%s" % sentences_string)
    # Variant of the extraction loop body: skip files that fail to parse.
    try:
        if file_extension == ".pdf":
            doc = convert_pdf_to_txt(file_path)
        elif file_extension == ".docx":
            doc = convert_docx_to_txt(file_path)
        else:
            continue
    except Exception:
        continue

    if doc:
        if isinstance(doc, bytes):
            doc = doc.decode("utf8")
        doc = words_to_phrases(doc)
        doc = doc.lower()
        sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

print(len(sentences))

# sentences_string = ""
# for item in sentences:
#     item = " ".join(item)
#     sentences_string += "\n" + item
#
# with open("sentences.txt", "w") as text_file:
#     text_file.write("%s" % sentences_string)

# print(sentences[0])

# Set values for various parameters
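# ---------------------------------------------------------------------------
# The listings above call several helpers (convert_pdf_to_txt,
# convert_docx_to_txt, doc_to_sentences) and an NLTK `tokenizer` that are
# defined elsewhere in the project. A minimal sketch of what they might look
# like is given below; the bodies are assumptions based on pdfminer.six,
# python-docx and NLTK, not the project's actual implementations.
# (words_to_phrases, which joins multi-word phrases, is not sketched here.)
# ---------------------------------------------------------------------------
import re

import nltk.data
from nltk.corpus import stopwords
from pdfminer.high_level import extract_text   # pdfminer.six
from docx import Document                      # python-docx

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


def convert_pdf_to_txt(path):
    # Extract the plain text of a PDF file.
    return extract_text(path)


def convert_docx_to_txt(path):
    # Join the text of every paragraph in a .docx file.
    return "\n".join(p.text for p in Document(path).paragraphs)


def doc_to_sentences(doc, tokenizer, remove_stopwords=False):
    # Split a document into sentences, each returned as a list of word tokens
    # (letters and underscores only), optionally dropping English stop words.
    result = []
    stops = set(stopwords.words("english")) if remove_stopwords else set()
    for raw in tokenizer.tokenize(doc.strip()):
        words = re.sub("[^a-zA-Z_]", " ", raw).lower().split()
        words = [w for w in words if w not in stops]
        if words:
            result.append(words)
    return result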
            doc = fp.read()
        # elif str(file).__contains__(".pdf"):
        #     doc = convert_pdf_to_txt(file)
        # elif str(file).__contains__(".docx"):
        #     doc = convert_docx_to_txt(file)
        else:
            continue
    except Exception:
        continue

    if doc:
        if isinstance(doc, bytes):
            doc = doc.decode("utf8")
        doc = words_to_phrases(doc)
        doc = doc.lower()
        sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

print(len(sentences))

with open("csc791_sentences.txt", "w") as text_file:
    text_file.write("%s" % sentences)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 10   # Minimum word count
num_workers = 2       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
print("Training model...")
model = word2vec.Word2Vec(sentences,