Example #1
def loadData(topic):
	docName=""
	if ".txt" in topic:
		#docName="~/Dropbox/sandbox/KnowledgeTools/Docs/"+topic
		docName=topic
	else:
		# call bash script to get and fix wiki article
		cmd="bash loader.sh "+topic.replace(' ', '_')
		#subprocess.Popen(cmd)
		process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
		output = process.communicate()[0]

		docName=DOCS_DIR+topic.replace(' ', '_')+".txt"

	# open document and parse into sentences and words

	
	data=[]
	with open (docName) as f:
		lines = f.readlines()
		for line in lines:
			words = line.split()
			for word in words:
				data.append(word)
				#print (word)


	# return word array for article
	return data
Example #2
def extract_data(input_file):
    """
    Extracts data from the input_file '../../timeline.htm'
    looks out <div class = "comment"> and extracts the following comment information
    Dumps the comments in to a file, checks whether file already exists.

    :param input_file: specify location of 'timeline.htm' (str)
    :param output_file: specify the output filename (str)
    :return: data
    """

    #opens file and closes, stores the file in output
    print("Reading {}...".format(input_file))
    t0 = time.time()
    with open(input_file,"r") as f:
        output = f.read()

    #Parse output string data using BeautifulSoup
    soup = BeautifulSoup(output, "lxml")
    extract_div = soup.find_all("div", {"class":"comment"})

    #Extract lines and write to csv file
    data = []
    for line in extract_div:
        data.append(line.text)

    t1 = time.time()
    print("It took {} ms to extract the comments".format((t1-t0)*1000))

    return data
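Since writing the extracted comments out is left to the caller, a minimal, hedged sketch of that step follows; the output filename comments.csv is an assumption.

import csv

comments = extract_data("../../timeline.htm")
with open("comments.csv", "w", newline="", encoding="utf-8") as f:  # hypothetical output file
    writer = csv.writer(f)
    writer.writerow(["comment"])            # header row
    for comment in comments:
        writer.writerow([comment.strip()])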
Example #3
 def create_dataframe(self):
     data = []
     for dialogue in ifilter(lambda x: x.has_deal(), self.examples):
         for turn in dialogue.turns:
             for u in turn.iter_utterances():
                 row = {
                     'post_id': dialogue.post_id,
                     'chat_id': dialogue.chat_id,
                     'scenario_id': dialogue.scenario_id,
                     'buyer_target': dialogue.buyer_target,
                     'listing_price': dialogue.listing_price,
                     'margin_seller': dialogue.margins['seller'],
                     'margin_buyer': dialogue.margins['buyer'],
                     'stage': u.stage,
                     'role': turn.role,
                     'num_tokens': u.num_tokens(),
                 }
                 for a in u.speech_acts:
                     row['act_{}'.format(a[0].name)] = 1
                 for cat, word_count in u.categories.iteritems():
                     row['cat_{}'.format(cat)] = sum(word_count.values())
                 for q in dialogue.eval_questions:
                     for r in ('buyer', 'seller'):
                         key = 'eval_{question}_{role}'.format(question=q,
                                                               role=r)
                         try:
                             row[key] = dialogue.eval_scores[r][q]
                         except KeyError:
                             row[key] = -1
                 data.append(row)
     df = pd.DataFrame(data).fillna(0)
     return df
Example #4
def question_sorter(data):
    from IPython.core.display import display, HTML
    display(HTML("<style>.container { width:100% !important; }</style>"))
    question_list = list(question_dict.values())
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    clean = '\n-----\n'.join(tokenizer.tokenize(data))
    split_list = clean.split('\n-----\n')

    matches1 = {}
    index_list = []

    for key in question_dict.keys():

        matches2 = {}
        i = 0

        for sentence in split_list:
            x = fuzz.ratio(question_dict[key], sentence)
            matches2[i] = x
            i += 1

        calc = max(matches2, key=matches2.get)
        maximum = [calc, matches2[calc]]
        start_index = max(matches2, key=matches2.get) + 1
        maximum.append(start_index)
        matches1[key] = maximum

    matches3 = {}

    for key, value in matches1.items():
        if value[1] >= 80:
            matches3[key] = matches1[key]
            index_list.append(matches1[key][0])
        else:
            pass

    index_list.pop(0)

    i2 = 0

    for key, value in matches3.items():
        try:
            value.append(index_list[i2] - 1)
            i2 += 1
        except IndexError:
            value.append(len(split_list) - 1)

    responses = {}

    for key, value in matches3.items():

        i = value[2]
        data = []

        while i <= value[3]:
            data.append(split_list[i])
            i += 1

        responses[key] = data

    return responses
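question_sorter relies on a module-level question_dict, on fuzz (presumably from the fuzzywuzzy package), and on nltk's punkt model, none of which are shown in the snippet; it is also meant to run in a Jupyter notebook, since it calls IPython display. A hypothetical setup might look like this (the questions and text are made up):

import nltk
from fuzzywuzzy import fuzz   # assumed source of the fuzz global used above

nltk.download('punkt')        # needed once for the punkt tokenizer

question_dict = {
    'name': 'What is your name?',
    'city': 'Where do you live?',
}

text = ("What is your name? I am Ada Lovelace. "
        "Where do you live? I live in London, near the river.")
responses = question_sorter(text)   # e.g. {'name': [...], 'city': [...]}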
Example #5
def process_tarfile_question_titles(tar):

    data = []

    for xml_text, filename in iter_xmls(tar):

        root = ET.fromstring(xml_text)

        title_node = root.find("front/article-meta/title-group/article-title")

        if title_node is None or title_node.text is None:
            logging.warning("No title: %s" % filename)
            continue

        title = title_node.text

        if title is None or title[-1] != "?":
            continue

        abstract_node = root.find("**/abstract")
        if abstract_node is None:
            logging.warning("No abstract: %s" % filename)
            continue
        abstract_xml = ET.tostring(abstract_node).decode("utf-8")

        data.append({
            "id": filename,
            "title": title,
            "abstract_xml": abstract_xml,
        })

    print("Done processing tarfile %s. %d Questions added." % (tar, len(data)))

    return data
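process_tarfile_question_titles depends on an iter_xmls helper that is not shown (ET is assumed to be xml.etree.ElementTree). A plausible sketch, assuming tar is a path to a .tar archive of XML files and that the helper yields (xml_text, filename) pairs:

import tarfile

def iter_xmls(tar_path):
    # hypothetical helper: yield (xml_text, filename) for each XML member
    with tarfile.open(tar_path) as archive:
        for member in archive.getmembers():
            if not member.isfile() or not member.name.endswith(".xml"):
                continue
            handle = archive.extractfile(member)
            if handle is None:
                continue
            yield handle.read().decode("utf-8", errors="replace"), member.name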
Example #6
def load_title(filename):
    data = []
    with open(filename) as fh:
        reader = csv.DictReader(fh)
        raw_data = list(reader)
        for row in raw_data:
            title = unicode(row['Headline'],
                            errors='ignore').decode('utf-8').strip()
            clean_title = clean(title)
            clean_title = get_tokenized_lemmas(clean_title)

            id = row['Body ID']
            # ignore the stance if there is any
            data.append((clean_title, id))
    return data

    # NOTE: everything below is unreachable (the function returns above);
    # it looks like an older implementation based on unicode_csv_reader.
    reader = unicode_csv_reader(open(filename))
    for row in reader:
        title = row[0]
        clean_title = clean(title)
        clean_title = get_tokenized_lemmas(clean_title)

        id = row[1]
        # ignore the stance if there is any
        data.append((clean_title, id))
    return data
Example #7
def loadData(topic):
    docName = ""
    if ".txt" in topic:
        #docName="~/Dropbox/sandbox/KnowledgeTools/Docs/"+topic
        docName = topic
    else:
        # call bash script to get and fix wiki article
        cmd = "bash loader.sh " + topic.replace(' ', '_')
        #subprocess.Popen(cmd)
        process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
        output = process.communicate()[0]

        docName = DOCS_DIR + topic.replace(' ', '_') + ".txt"

    # open document and parse into sentences and words

    data = []
    with open(docName) as f:
        lines = f.readlines()
        for line in lines:
            words = line.split()
            for word in words:
                data.append(word)
                #print (word)

    # return word array for article
    return data
Example #8
def contextualize(data_set):
    data = []
    context = []
    for line in data_set:
        # call sentence2sequence once per line instead of twice
        result = sentence2sequence(line)
        data.append(tuple(result[0]))
        context.append(tuple(result[1]))
    return data, context
Example #9
def write_bert_tokenized_json_classification(filename, sentences, labels):
    data = []
    sentence_encodings = bert_tokenizer(sentences,
                                        return_offsets_mapping=False,
                                        padding=False,
                                        truncation=True)
    for i, label in enumerate(labels):
        token_id = sentence_encodings[i].ids
        type_id = sentence_encodings[i].type_ids
        data.append({
            'uid': str(i),
            'label': label,
            'token_id': token_id,
            'type_id': type_id
        })

    # Write the JSON dataset to ./ensemble_modeling directory
    with open(f'./ensemble_modeling/multi_task_learning/{filename}.json',
              'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the JSON dataset to mt-dnn canonical data directory
    with open(f'../mt-dnn/canonical_data/{filename}.json', 'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')
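write_bert_tokenized_json_classification assumes a module-level bert_tokenizer; a fast HuggingFace tokenizer matches the .ids / .type_ids accesses above. A hypothetical invocation (the output directories are assumed to exist):

from transformers import AutoTokenizer

# assumed module-level tokenizer (any fast tokenizer exposing .ids / .type_ids)
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sentences = ["The screen is great.", "The battery died after a week."]
labels = [1, 0]
write_bert_tokenized_json_classification("sentiment_train", sentences, labels)
# writes ./ensemble_modeling/multi_task_learning/sentiment_train.json
# and    ../mt-dnn/canonical_data/sentiment_train.json (one JSON object per line)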
Example #10
 def get_json(self):
     self.__get_files()
     data = []
     for topic in self.files:
         for file in self.files[topic]:
             content = FileReader(filePath=file).content()
             data.append({'category': topic, 'content': content})
     return data
Example #11
def load_stance(filename):
    reader = unicode_csv_reader(open(filename))
    data = []
    for title, id, stance in reader:
        clean_title = clean(title)
        clean_title = get_tokenized_lemmas(clean_title)
        data.append((clean_title, id, stance.strip()))
    return data
Example #12
def write_bert_tokenized_json_ner(filename, texts, spans):
    data = []
    offset_mappings = []
    texts = [text for text in texts]
    spans = [span for span in spans]
    text_encodings = bert_tokenizer(texts,
                                    return_offsets_mapping=True,
                                    padding=False,
                                    truncation=True)
    labels = [
        preserve_labels(text_encodings[i], span)
        for i, span in enumerate(spans)
    ]
    for i, label in enumerate(labels):
        # update the CLS and SEP label ids
        label[0], label[-1] = 2, 3
        # retrieve the token ids
        token_id = text_encodings[i].ids
        # retrieve the type ids
        type_id = text_encodings[i].type_ids
        # add tokenized post to data
        data.append({
            'uid': i,
            'label': label,
            'token_id': token_id,
            'type_id': type_id
        })
        # save the offsets mapping for computing scores later
        offset_mappings.append(text_encodings[i].offsets)

    # Write the JSON dataset to ./ensemble_modeling directory
    with open(f'./ensemble_modeling/multi_task_learning/{filename}.json',
              'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the JSON dataset to mt-dnn canonical_data directory
    with open(f'../mt-dnn/canonical_data/{filename}.json', 'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the token offset mappings
    with open(
            f'./ensemble_modeling/multi_task_learning/{filename}_offsets.txt',
            'w') as json_file:
        for line in offset_mappings:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the gold span labels
    with open(f'./ensemble_modeling/multi_task_learning/{filename}_spans.txt',
              'w') as json_file:
        for span in spans:
            json.dump(span, json_file)
            json_file.write('\n')
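write_bert_tokenized_json_ner calls a preserve_labels helper that is not included. A rough sketch under the assumption that each span entry is a list of (start, end) character offsets and that tokens inside a span get label 1, all others 0 (the real label scheme may differ):

def preserve_labels(encoding, span):
    # hypothetical helper: one label per token, 1 if the token's character
    # range overlaps any gold (start, end) span, else 0; the CLS/SEP labels
    # are overwritten by the caller.
    labels = []
    for tok_start, tok_end in encoding.offsets:
        inside = any(s < tok_end and tok_start < e for s, e in span)
        labels.append(1 if inside else 0)
    return labels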
Example #13
def join_data(tokens, tags, lem):
	data = []
	for i in range(len(tokens)):
		dados = []
		dados.append(tokens[i]) 
		dados.append(tags[i]) 
		dados.append(lem[i]) 
		data.append(dados)
	return data
Example #14
    def __load_csv(self, filename):
        data = []
        with open(filename, 'rt') as csvfile:
            readr = csv.reader(csvfile, delimiter=' ', quotechar='|')
            for row in readr:
                if len(row) > 0:
                    data.append(row[0])

        return data
Example #15
 def get_url_predictions(self, url, ignore_pos=True):
     html = urlopen(url)
     soup = BeautifulSoup(html.read())
     data = []
     for string in soup.strings:
         string = " ".join(re.split("[^a-zA-Z.,!?]*",
                                    string.lower())).strip()
         data.append(string)
     return self._text_predictions(data, ignore_pos=ignore_pos)
Example #16
def read_data(filename):
    #issue = {}
    data = []
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        next(reader)
        for row in reader:
            #issue[row[0]] = row
            data.append(row)
        return np.array(data)
Example #17
def load_stance(filename):
    # reader = unicode_csv_reader(open(filename))
    reader = unicode_csv_reader(open(
        filename, errors='ignore'))  # NOTE: Changed for python3.
    data = []
    for title, id, stance in reader:
        clean_title = clean(title)
        clean_title = get_tokenized_lemmas(clean_title)
        data.append((clean_title, id, stance.strip()))
    return data
Example #18
def analyze_content(_str):
    print 'warning, deprecated'
    _str = str(_str)
    html = urlopen(_str)
    soup = BeautifulSoup(html.read())
    data = []
    for string in soup.strings:
        string = " ".join(re.split("[^a-zA-Z.,!?]*", string.lower())).strip()
        data.append(string)
    return get_tweets_predictions(data).tolist()
Example #19
def append_result(text, label):
    """
    appends text and label to objectivity.json
    """
    a_dict = {'text': text, 'label': label}

    with open('objectivity.json') as f:
        data = json.load(f)

    data.append(a_dict)

    with open('objectivity.json', 'w') as f:
        json.dump(data, f)
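append_result expects objectivity.json to already exist and to contain a JSON list; a guarded call might look like this (the bootstrap step is an assumption):

import json
import os

if not os.path.exists('objectivity.json'):
    with open('objectivity.json', 'w') as f:
        json.dump([], f)          # start with an empty list

append_result("The figures are taken from the annual report.", "objective")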
Example #20
def loadFile(fileName):
    firstFile = ""
    with open(fileName, 'r') as cFile:
        firstFile = cFile.read()

    firstFile = firstFile.split('\n')

    data = []

    for line in firstFile:
        data.append(line.split(','))

    return data
Example #21
def read_data(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            target = row[4]
            data.append((plottext, target))
    (X, Ycat) = zip(*data)
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Ycat)
    global labels
    labels = le.inverse_transform([0,1,2,3,4])
    return (X, Y)
Example #22
def read_data(file_name=''):
    data = []
    print('Reading data start', datetime.now())
    # file_name = 'Electronics_5.json'  # 13 secs loading time
    # file_name = 'Digital_Music_5.json'  # 1 sec loading time

    f = open(file_name, 'r')
    for line in f.readlines():
        tmp = json.loads(line)
        data.append([tmp['reviewText'], tmp['overall']])
    f.close()
    print('Reading data finish', datetime.now())
    return data
Example #23
def load_data(directory):
	data = list()
	files = list()
	for name in tqdm(os.listdir(directory)):
		try:
			filename = directory + '/' + name
			datum = load_doc(filename)
			document, summary = split_doc(datum)
			data.append({'document':document, 'summary':summary})
			files.append(filename)
		except UnicodeDecodeError:
			print(name)
	return data, files
Example #24
def read_data(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            target = row[4]
            data.append((plottext, target))
    (X, Ycat) = zip(*data)
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Ycat)
    global labels
    labels = le.inverse_transform([0,1,2,3,4])
    return (X, Y)
Example #25
def json_encode(dic):
    """
    dictionary in
    json object out
    """
    print "TEST: json_encode call"

    data = []
    data.append(dic)
    data_string = json.dumps(data)
    print "TEST: json encoded ", data_string

    return data_string
Example #26
def load_title(filename):
    data = []
    with open(filename, errors='ignore') as fh:
        reader = csv.DictReader(fh)
        raw_data = list(reader)
        for row in raw_data:
            title = str(row['Headline']).strip()
            clean_title = clean(title)
            clean_title = get_tokenized_lemmas(clean_title)

            id = row['Body ID']
            # ignore the stance if there is any
            data.append((clean_title, id))
    return data
Example #27
def generate_dataset(corpus):
    output = []
    for line in corpus:
        token_list = line
        for i in range(1, len(token_list)):
            data = []
            x_ngram = '<start> '+ token_list[:i+1] + ' <end>'
            y_ngram = '<start> '+ token_list[i+1:] + ' <end>'
            data.append(x_ngram)
            data.append(y_ngram)
            output.append(data)
    print("Dataset prepared with prefix and suffixes for teacher forcing technique")
    dummy_df = pd.DataFrame(output, columns=['input','output'])
    return output, dummy_df
Example #28
 def get_json(self):
     self.__get_files()
     data = []
     for topic in self.files:
         rand = randint(100, 150)
         i = 0
         for file in self.files[topic]:
             content = FileReader(filePath=file).content()
             data.append({'category': topic, 'content': content})
             if i == rand:
                 break
             else:
                 i += 1
     return data
Example #29
def read_data_eval(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            genre = {'genre': row[3]}
            year = row[2]
            target = row[4]
            data.append((plottext, genre, target, year))
    (X, genre, Ycat, year) = zip(*data)
    year = np.array(year, dtype='float')
    year = yearscale.transform(year).reshape((-1, 1))
    Y = le.transform(Ycat)
    genre = dv.transform(genre)
    return (X, Y, genre, year)
Example #30
def read_data_eval(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            genre = { 'genre': row[3] }
            year = row[2]
            target = row[4]
            data.append((plottext,genre,target,year))
    (X, genre, Ycat,year) = zip(*data)
    year = np.array(year, dtype='float')
    year = yearscale.transform(year).reshape((-1,1))
    Y = le.transform(Ycat)
    genre = dv.transform(genre)
    return (X, Y, genre, year)
Example #31
def read_corpus(file_path, source):
    """ Read file, where each sentence is dilineated by a `\n`.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    """
    data = []
    for line in open(file_path):
        sent = nltk.word_tokenize(line)
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)

    return data
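A hypothetical call to read_corpus; the file paths are made up, and nltk's punkt model must be downloaded first. Only the target side is wrapped in the <s> ... </s> boundary markers.

src_sents = read_corpus('data/train.de', source='src')   # requires nltk.download('punkt')
tgt_sents = read_corpus('data/train.en', source='tgt')
# src_sents[0] -> ['Das', 'ist', 'ein', 'Test', '.']                (example)
# tgt_sents[0] -> ['<s>', 'This', 'is', 'a', 'test', '.', '</s>']   (example)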
Example #32
def addFileToData(filename, data):
    intColumns = ['No. Reader', 'No. Helpful', 'Cleanliness','Check in / front desk', 'Value', 'Overall', 'Service', 'Business service', 'Rooms', 'Location']
    characterThreshold = 60
    with open(filename, 'r') as content_file:
        content = content_file.read()
        
        #print(repr(content))
    if content.count("\r") > 0:
        reviews = content.split("\r\n\r\n")
    else:
        reviews = content.split("\n\n")
    
    for r in reviews:
        thisReview = pd.Series([None]*len(cats), cats)
        splt = r.split("\n")
        for s in splt:
            for c in cats:
                if "<"+c+">" in s:
                    value = s.replace('<'+c+'>', '')
                    if c in intColumns:
                        value = int(value)
                    if value == -1:  # we don't want -1 since it would skew averaging; use np.nan instead
                        value = np.nan

                    if c == "Content":
                        value = remove_non_ascii(value.lower())

                    thisReview[c] = value
                    
        if not thisReview["Content"] == None and len(thisReview["Content"]) > characterThreshold:
            #only add if theres content and its long enough
            data = data.append(thisReview, ignore_index=True)
    return data
Example #33
def json_file(data_string, filename, fcnname ):
    """
    jsonEncoded data_string in, filename, and function name in
    json file out
    naming convention for json files:
        "filename(including ext).functionname.json"
    """
    print "TEST: json_file call"
    data = []
    data.append(data_string)
    f = open((str(filename) + '.' + str(fcnname) + ".json"), "w+")
    json.dump(data, f, separators=('},{',', '))

    f.flush()
    print "TEST: successful json dump and flush"
    return data_string
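A hypothetical call showing the naming convention from the docstring, reusing json_encode from Example #25 (assuming both snippets come from the same module):

encoded = json_encode({'speaker': 'alice', 'turn': 3})
json_file(encoded, 'dialog.txt', 'json_encode')
# -> writes dialog.txt.json_encode.json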
Example #34
 def run(self):
     t0 = time()
     params = self.params
     numX = len(params.Xinds)
     data = []  
     Pw_zs = []    ####
     for i in range(numX):
         data.append(params.ws[i]*self.Xs[params.Xinds[i]])
         Pw_zs.append(self.initsDescriptor.Pw_zs[params.Xinds[i]])  ####
     self.initsDescriptor.Pw_zs = Pw_zs  ####
     sys.stderr.write('PLSABet run at node '+str(params.eventID)+'...\n')
     self.descriptor = pLSABet(data,self.initsDescriptor,params.lambdaB,
             params.selectTime,self.DT,params.wt,params.Learn,params.debug)  #
     if self.descriptor is None:
         sys.stderr.write("############### Pw_zs = None. Reduce K!!! \n") 
         exit(-1)
     print( "pLSA done in "+str(time() - t0))
Example #35
def loadDoc(topic):
	docName=""
	if "." in topic:
		#docName="~/Dropbox/sandbox/KnowledgeTools/Docs/"+topic
		docName=topic
	else:
		# call bash script to get and fix wiki article

		cmd="bash "+CODE_DIR+"loader.sh "+'"'+topic.replace(' ', '_')+'"'


		#cmd="bash loader.sh \""+topic.replace+"\""
		#print "CMD =", cmd
		#subprocess.Popen(cmd)
		
		#process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
		#output = process.communicate()[0]

		subprocess.call(cmd, shell=True)


		docName=DOCS_DIR+topic.lower().replace(' ', '_')+".txt"


	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

	data=[]
	with open(docName) as f:
		while True:
			c = f.read(1)
			if not c:
				break
			
			if ord(c) < 32 or ord(c) > 126:
				data.append(' ')
			else:
				data.append(c)
			

	data = ''.join(data)

	parsedData = tokenizer.tokenize(data)
	newParsedData = [sentence.split() for sentence in parsedData]

	return newParsedData
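loadDoc relies on module-level CODE_DIR and DOCS_DIR constants, on a loader.sh script that fetches the wiki article, and on nltk's punkt model. A hypothetical call, assuming the function lives in the same module where these constants are defined:

import nltk
nltk.download('punkt')             # needed once for the punkt tokenizer

CODE_DIR = './'                    # assumed location of loader.sh
DOCS_DIR = './Docs/'               # assumed cache directory for fetched articles

sentences = loadDoc("Machine learning")
print(len(sentences), "sentences; first:", sentences[0])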
Example #36
def load_file(filePath):
    fname, ext = os.path.splitext(filePath)

    dictionary = {}
    data = []
    with open(filePath) as data_file:
        for line in data_file:
            data.append(json.loads(line))

    for d1 in data:
        bid = d1.get('business_id')
        review = d1.get('text')
        rid = d1.get('review_id')
        dict_temp = {bid: review}
        dictionary[rid] = dict_temp
        #print (dictionary)

    return dictionary
Example #37
def save_to_elasticsearch(es: elasticsearch.Elasticsearch, bucket: str,
                          key: str, sentences: list):
    """

    :param es:
    :param bucket:
    :param key:
    :param sentences:
    :return:
    """
    parse_date = datetime.datetime.now()
    text_source_doc_id = bucket + "|" + key
    cik, form_type, as_of_date, company_name, edgar_file = key.split('|')
    cik = int(cik)
    as_of_date = datetime.datetime.strptime(as_of_date, '%Y%m%d').date()
    line_count = len(sentences)

    data = []
    text_source_action = {
        "cik": cik,
        "form_type": form_type,
        "as_of_date": as_of_date,
        "line_count": line_count,
        "parse_date": parse_date,
        "parser_version": __version__
    }

    res = es.index(index="text_source", body=text_source_action)
    text_source_id = res['_id']
    for line_number, content in enumerate(sentences, 1):
        line_action = {
            "_index": "text_line",
            "text_source_id": text_source_id,
            "content": content,
            "line_number": line_number,
            "as_of_date": as_of_date,
            "cik": cik,
            "form_type": form_type
        }
        data.append(line_action)

    _logger.info("Saving to elasticsearch: {text_source_doc_id}".format(
        text_source_doc_id=text_source_doc_id))
    helpers.bulk(es, data)
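A hypothetical call; the pipe-delimited key layout follows the split performed inside save_to_elasticsearch, and the text_source / text_line index names come from the function body.

import elasticsearch

es = elasticsearch.Elasticsearch("http://localhost:9200")    # assumed local cluster
save_to_elasticsearch(
    es,
    bucket="edgar-filings",                                   # assumed bucket name
    key="320193|10-K|20230930|Apple Inc.|aapl-20230930.htm",  # cik|form|date|company|file
    sentences=["Revenue decreased 3 percent.", "We continued our buyback program."],
)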
Example #38
def read_data(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            genre = { 'genre': row[3] }
            year = row[2]
            target = row[4]
            data.append((plottext,genre,target,year))
    (X, genre, Ycat,year) = zip(*data)
    year = np.array(year, dtype='float')
    global yearscale
    yearscale = preprocessing.StandardScaler()
    year = yearscale.fit_transform(year).reshape((-1,1))
    global le
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Ycat)
    global labels
    labels = le.inverse_transform([0,1,2,3,4])
    global dv
    dv = DictVectorizer(sparse=False)
    genre = dv.fit_transform(genre)
    return (X, Y, genre, year)
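read_data fits and stores yearscale, le and dv as module-level globals, and read_data_eval (Examples #29/#30) reuses them, so the training file must be read first. A hypothetical usage (file names are assumptions):

X_train, y_train, genre_train, year_train = read_data('train.csv')
X_eval, y_eval, genre_eval, year_eval = read_data_eval('eval.csv')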
Example #39
def jsonEncode(dic): # dictionary in json object out
    data = []
    data.append(dic)
    data_string = json.dumps(data, separators = (',',':'))
    print "JSON ENCODED: ", data_string
    return data_string
Example #40
def jsonToFile(data_string, name): # dictionary in json file out "jsondata"
    data = []
    data.append(data_string)
    f = open((name + ".json"), "w+")
    json.dump(data, f, separators = (',',':'))
    f.flush()